author      Allan Sandfeld Jensen <allan.jensen@qt.io>   2021-05-20 09:47:09 +0200
committer   Allan Sandfeld Jensen <allan.jensen@qt.io>   2021-06-07 11:15:42 +0000
commit      189d4fd8fad9e3c776873be51938cd31a42b6177 (patch)
tree        6497caeff5e383937996768766ab3bb2081a40b2 /chromium/third_party/libyuv
parent      8bc75099d364490b22f43a7ce366b366c08f4164 (diff)
download    qtwebengine-chromium-189d4fd8fad9e3c776873be51938cd31a42b6177.tar.gz
BASELINE: Update Chromium to 90.0.4430.221
Change-Id: Iff4d9d18d2fcf1a576f3b1f453010f744a232920
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/third_party/libyuv')
-rw-r--r--  chromium/third_party/libyuv/DEPS | 34
-rw-r--r--  chromium/third_party/libyuv/README.chromium | 2
-rw-r--r--  chromium/third_party/libyuv/include/libyuv.h | 1
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/planar_functions.h | 10
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale_row.h | 1
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/version.h | 2
-rw-r--r--  chromium/third_party/libyuv/source/compare_gcc.cc | 344
-rw-r--r--  chromium/third_party/libyuv/source/compare_neon.cc | 70
-rw-r--r--  chromium/third_party/libyuv/source/compare_neon64.cc | 72
-rw-r--r--  chromium/third_party/libyuv/source/cpu_id.cc | 4
-rw-r--r--  chromium/third_party/libyuv/source/planar_functions.cc | 33
-rw-r--r--  chromium/third_party/libyuv/source/rotate_gcc.cc | 540
-rw-r--r--  chromium/third_party/libyuv/source/rotate_neon.cc | 218
-rw-r--r--  chromium/third_party/libyuv/source/rotate_neon64.cc | 292
-rw-r--r--  chromium/third_party/libyuv/source/row_gcc.cc | 5869
-rw-r--r--  chromium/third_party/libyuv/source/row_neon.cc | 2254
-rw-r--r--  chromium/third_party/libyuv/source/row_neon64.cc | 2755
-rw-r--r--  chromium/third_party/libyuv/source/scale_any.cc | 7
-rw-r--r--  chromium/third_party/libyuv/source/scale_common.cc | 4
-rw-r--r--  chromium/third_party/libyuv/source/scale_gcc.cc | 1397
-rw-r--r--  chromium/third_party/libyuv/source/scale_neon.cc | 717
-rw-r--r--  chromium/third_party/libyuv/source/scale_neon64.cc | 931
-rw-r--r--  chromium/third_party/libyuv/source/scale_uv.cc | 27
-rw-r--r--  chromium/third_party/libyuv/unit_test/planar_test.cc | 64
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_argb_test.cc | 15
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_test.cc | 201
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_uv_test.cc | 15
-rw-r--r--  chromium/third_party/libyuv/winarm.mk | 1
28 files changed, 8150 insertions, 7730 deletions
diff --git a/chromium/third_party/libyuv/DEPS b/chromium/third_party/libyuv/DEPS
index 3558eb56c71..de185434500 100644
--- a/chromium/third_party/libyuv/DEPS
+++ b/chromium/third_party/libyuv/DEPS
@@ -1,24 +1,18 @@
-gclient_gn_args_file = 'src/build/config/gclient_args.gni'
-gclient_gn_args = [
- 'mac_xcode_version',
-]
-
vars = {
'chromium_git': 'https://chromium.googlesource.com',
- 'chromium_revision': '5aaa70b53c3cbb80a3bcf11d17c6806729f65a8b',
+ 'chromium_revision': '64c8c30faaf969c15c028131dfcd0819208039c1',
'gn_version': 'git_revision:6f13aaac55a977e1948910942675c69f2b4f7a94',
- 'mac_xcode_version': 'default',
}
deps = {
'src/build':
- Var('chromium_git') + '/chromium/src/build' + '@' + 'bddddb36e86375a17ccb10ab243a7c19c686268b',
+ Var('chromium_git') + '/chromium/src/build' + '@' + '2d2f9f2b85592bb9af5753ef300c055e6feb709f',
'src/buildtools':
- Var('chromium_git') + '/chromium/src/buildtools' + '@' + '98881a1297863de584fad20fb671e8c44ad1a7d0',
+ Var('chromium_git') + '/chromium/src/buildtools' + '@' + '6302c1175607a436e18947a5abe9df2209e845fc',
'src/testing':
- Var('chromium_git') + '/chromium/src/testing' + '@' + '7ade79a849fc4988e4418161920a410cb3b9f5ae',
+ Var('chromium_git') + '/chromium/src/testing' + '@' + '40b44171056045ed1f85ca0b57485e46c03d7867',
'src/third_party':
- Var('chromium_git') + '/chromium/src/third_party' + '@' + '0e4b552d799aa8978d441662e51bbd8da69a4f71',
+ Var('chromium_git') + '/chromium/src/third_party' + '@' + '24ccdf9b7553446791983bf357261c5e0a4314a0',
'src/buildtools/linux64': {
'packages': [
@@ -61,13 +55,13 @@ deps = {
Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'd999d54f4bca789543a2eb6c995af2d9b5a1f3ed',
'src/third_party/catapult':
- Var('chromium_git') + '/catapult.git' + '@' + '7030291356e4ba8dd8a59c7a632764e5b8238a14',
+ Var('chromium_git') + '/catapult.git' + '@' + 'ccc9dd2835f5a7c5c82ae3c1a2fbc2fe2fd9dfd1',
'src/third_party/colorama/src':
Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8',
'src/third_party/depot_tools':
- Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + 'c7be37e121739c84b8dad04a23434e02177b9b2d',
+ Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + '91bb7506bd20ed22b8787e7a8b9975cc07e97175',
'src/third_party/freetype/src':
- Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'e9a7015ec876bc79a345af03bb756a0980345498',
+ Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '26e2a89598d69c7aba76c83f6a1fcf1db17574ab',
'src/third_party/googletest/src':
Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '4fe018038f87675c083d0cfb6a6b57c274fb1753',
'src/third_party/harfbuzz-ng/src':
@@ -79,7 +73,7 @@ deps = {
'src/third_party/yasm/source/patched-yasm':
Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '720b70524a4424b15fc57e82263568c8ba0496ad',
'src/tools':
- Var('chromium_git') + '/chromium/src/tools' + '@' + '72035b43be1620ac058061bfb6f58d3a51d3bcc3',
+ Var('chromium_git') + '/chromium/src/tools' + '@' + '1bb7c085e67a0fc8c63511af83299d1632f5a3f3',
'src/tools/swarming_client':
Var('chromium_git') + '/infra/luci/client-py.git' + '@' + 'd46ea7635f2911208268170512cb611412488fd8',
@@ -112,9 +106,9 @@ deps = {
'condition': 'checkout_android',
},
'src/third_party/boringssl/src':
- 'https://boringssl.googlesource.com/boringssl.git' + '@' + 'a673d02458b1b7d897084266b93d5c610e36bd17',
+ 'https://boringssl.googlesource.com/boringssl.git' + '@' + '1607f54fed72c6589d560254626909a64124f091',
'src/base': {
- 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'e5c8a2271eca4665e90a3c209db9fbf478c9ecfb',
+ 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'e096814b0448fba1095c6e7be7c7a0b5d7264251',
'condition': 'checkout_android',
},
'src/third_party/bazel': {
@@ -263,7 +257,7 @@ deps = {
},
'src/third_party/icu': {
- 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'c7c91f829d1d5421be329536811d9336af09b27d',
+ 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'c2a4cae149aae7fd30c4cbe3cf1b30df03b386f1',
},
'src/third_party/icu4j': {
'packages': [
@@ -370,7 +364,7 @@ deps = {
'dep_type': 'cipd',
},
'src/third_party/robolectric/robolectric': {
- 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + 'dc8c5f555f0f542dffc71b5fb80f52fe4ea946e2',
+ 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '2f3e0a3ac450a17dbf2e7d4eaab3a1f14dda50e6',
'condition': 'checkout_android',
},
'src/third_party/sqlite4java': {
@@ -414,7 +408,7 @@ deps = {
# iOS deps:
'src/ios': {
- 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '5900cb114e36aeff8db416b8436aa0e91632afa5',
+ 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '60ef55beac67e3c0eda1c35ab7944c786b377313',
'condition': 'checkout_ios'
},
diff --git a/chromium/third_party/libyuv/README.chromium b/chromium/third_party/libyuv/README.chromium
index 4ff4ea445d1..578228da455 100644
--- a/chromium/third_party/libyuv/README.chromium
+++ b/chromium/third_party/libyuv/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1767
+Version: 1770
License: BSD
License File: LICENSE
diff --git a/chromium/third_party/libyuv/include/libyuv.h b/chromium/third_party/libyuv/include/libyuv.h
index aeffd5ef7a4..a06e1233abb 100644
--- a/chromium/third_party/libyuv/include/libyuv.h
+++ b/chromium/third_party/libyuv/include/libyuv.h
@@ -26,6 +26,7 @@
#include "libyuv/scale.h"
#include "libyuv/scale_argb.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h"
#include "libyuv/version.h"
#include "libyuv/video_common.h"
diff --git a/chromium/third_party/libyuv/include/libyuv/planar_functions.h b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
index 9e0038f4745..8d868b95425 100644
--- a/chromium/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
@@ -200,6 +200,16 @@ int I444Copy(const uint8_t* src_y,
int width,
int height);
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+ int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_uv, int dst_stride_uv, int width, int height);
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+ int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_vu, int dst_stride_vu, int width, int height);
+
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libyuv/include/libyuv/scale_row.h
index 95ecef89266..a386d499895 100644
--- a/chromium/third_party/libyuv/include/libyuv/scale_row.h
+++ b/chromium/third_party/libyuv/include/libyuv/scale_row.h
@@ -113,6 +113,7 @@ extern "C" {
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/chromium/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libyuv/include/libyuv/version.h
index 1d085960e39..3c632b3ab06 100644
--- a/chromium/third_party/libyuv/include/libyuv/version.h
+++ b/chromium/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1767
+#define LIBYUV_VERSION 1770
#endif // INCLUDE_LIBYUV_VERSION_H_
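
Not part of the patch: LIBYUV_VERSION moves from 1767 to 1770 in the same change that introduces NV12Copy()/NV21Copy(), so downstream code built against several libyuv checkouts can gate on the macro. A minimal sketch, assuming only the macros and declarations shown in this diff (CheckedNV12Copy is a hypothetical helper, not a libyuv API):

// Sketch only; not a libyuv API.
#include <stdint.h>
#include "libyuv/planar_functions.h"  // NV12Copy(), added in this change.
#include "libyuv/version.h"           // LIBYUV_VERSION.

static int CheckedNV12Copy(const uint8_t* src_y, int src_stride_y,
                           const uint8_t* src_uv, int src_stride_uv,
                           uint8_t* dst_y, int dst_stride_y,
                           uint8_t* dst_uv, int dst_stride_uv,
                           int width, int height) {
#if LIBYUV_VERSION >= 1770
  // NV12Copy() returns 0 on success, -1 on invalid arguments.
  return NV12Copy(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
                  dst_stride_y, dst_uv, dst_stride_uv, width, height);
#else
  // Older checkout without NV12Copy(); the caller would fall back to copying
  // the Y and UV planes itself, e.g. with two CopyPlane() calls.
  (void)src_y; (void)src_stride_y; (void)src_uv; (void)src_stride_uv;
  (void)dst_y; (void)dst_stride_y; (void)dst_uv; (void)dst_stride_uv;
  (void)width; (void)height;
  return -1;
#endif
}
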
diff --git a/chromium/third_party/libyuv/source/compare_gcc.cc b/chromium/third_party/libyuv/source/compare_gcc.cc
index 676527c1b1b..6700f9697e0 100644
--- a/chromium/third_party/libyuv/source/compare_gcc.cc
+++ b/chromium/third_party/libyuv/source/compare_gcc.cc
@@ -29,38 +29,38 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
uint64_t diff = 0u;
asm volatile(
- "xor %3,%3 \n"
- "xor %%r8,%%r8 \n"
- "xor %%r9,%%r9 \n"
- "xor %%r10,%%r10 \n"
+ "xor %3,%3 \n"
+ "xor %%r8,%%r8 \n"
+ "xor %%r9,%%r9 \n"
+ "xor %%r10,%%r10 \n"
// Process 32 bytes per loop.
LABELALIGN
"1: \n"
- "mov (%0),%%rcx \n"
- "mov 0x8(%0),%%rdx \n"
- "xor (%1),%%rcx \n"
- "xor 0x8(%1),%%rdx \n"
- "popcnt %%rcx,%%rcx \n"
- "popcnt %%rdx,%%rdx \n"
- "mov 0x10(%0),%%rsi \n"
- "mov 0x18(%0),%%rdi \n"
- "xor 0x10(%1),%%rsi \n"
- "xor 0x18(%1),%%rdi \n"
- "popcnt %%rsi,%%rsi \n"
- "popcnt %%rdi,%%rdi \n"
- "add $0x20,%0 \n"
- "add $0x20,%1 \n"
- "add %%rcx,%3 \n"
- "add %%rdx,%%r8 \n"
- "add %%rsi,%%r9 \n"
- "add %%rdi,%%r10 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "mov (%0),%%rcx \n"
+ "mov 0x8(%0),%%rdx \n"
+ "xor (%1),%%rcx \n"
+ "xor 0x8(%1),%%rdx \n"
+ "popcnt %%rcx,%%rcx \n"
+ "popcnt %%rdx,%%rdx \n"
+ "mov 0x10(%0),%%rsi \n"
+ "mov 0x18(%0),%%rdi \n"
+ "xor 0x10(%1),%%rsi \n"
+ "xor 0x18(%1),%%rdi \n"
+ "popcnt %%rsi,%%rsi \n"
+ "popcnt %%rdi,%%rdi \n"
+ "add $0x20,%0 \n"
+ "add $0x20,%1 \n"
+ "add %%rcx,%3 \n"
+ "add %%rdx,%%r8 \n"
+ "add %%rsi,%%r9 \n"
+ "add %%rdi,%%r10 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
- "add %%r8, %3 \n"
- "add %%r9, %3 \n"
- "add %%r10, %3 \n"
+ "add %%r8, %3 \n"
+ "add %%r9, %3 \n"
+ "add %%r10, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -80,26 +80,26 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
- "mov (%0),%%ecx \n"
- "mov 0x4(%0),%%edx \n"
- "xor (%1),%%ecx \n"
- "xor 0x4(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "mov 0x8(%0),%%ecx \n"
- "mov 0xc(%0),%%edx \n"
- "xor 0x8(%1),%%ecx \n"
- "xor 0xc(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "add $0x10,%0 \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "mov (%0),%%ecx \n"
+ "mov 0x4(%0),%%edx \n"
+ "xor (%1),%%ecx \n"
+ "xor 0x4(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "mov 0x8(%0),%%ecx \n"
+ "mov 0xc(%0),%%edx \n"
+ "xor 0x8(%1),%%ecx \n"
+ "xor 0xc(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "add $0x10,%0 \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -121,46 +121,46 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
uint32_t diff = 0u;
asm volatile(
- "movdqa %4,%%xmm2 \n"
- "movdqa %5,%%xmm3 \n"
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub %0,%1 \n"
+ "movdqa %4,%%xmm2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa 0x10(%0), %%xmm5 \n"
- "pxor (%0,%1), %%xmm4 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pand %%xmm2,%%xmm6 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm6,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "paddb %%xmm7,%%xmm6 \n"
- "pxor 0x10(%0,%1),%%xmm5 \n"
- "add $0x20,%0 \n"
- "movdqa %%xmm5,%%xmm4 \n"
- "pand %%xmm2,%%xmm5 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm5,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm5 \n"
- "pshufb %%xmm4,%%xmm5 \n"
- "paddb %%xmm7,%%xmm5 \n"
- "paddb %%xmm5,%%xmm6 \n"
- "psadbw %%xmm1,%%xmm6 \n"
- "paddd %%xmm6,%%xmm0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa 0x10(%0), %%xmm5 \n"
+ "pxor (%0,%1), %%xmm4 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pand %%xmm2,%%xmm6 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm6,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "paddb %%xmm7,%%xmm6 \n"
+ "pxor 0x10(%0,%1),%%xmm5 \n"
+ "add $0x20,%0 \n"
+ "movdqa %%xmm5,%%xmm4 \n"
+ "pand %%xmm2,%%xmm5 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm5,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufb %%xmm4,%%xmm5 \n"
+ "paddb %%xmm7,%%xmm5 \n"
+ "paddb %%xmm5,%%xmm6 \n"
+ "psadbw %%xmm1,%%xmm6 \n"
+ "paddd %%xmm6,%%xmm0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
- "pshufd $0xaa,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0, %3 \n"
+ "pshufd $0xaa,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -182,40 +182,40 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
- "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
- "sub %0,%1 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "vmovdqa (%0),%%ymm4 \n"
- "vmovdqa 0x20(%0), %%ymm5 \n"
- "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
- "vpand %%ymm2,%%ymm4,%%ymm6 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
- "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
- "add $0x40,%0 \n"
- "vpand %%ymm2,%%ymm4,%%ymm5 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
- "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
- "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
+ "vmovdqa (%0),%%ymm4 \n"
+ "vmovdqa 0x20(%0), %%ymm5 \n"
+ "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm6 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
+ "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
+ "add $0x40,%0 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm5 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
+ "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
+ "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
- "vpermq $0xb1,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xaa,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovd %%xmm0, %3 \n"
+ "vpermq $0xb1,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xaa,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovd %%xmm0, %3 \n"
"vzeroupper \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -234,34 +234,34 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -301,44 +301,44 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash;
asm volatile(
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "sub $0x10,%1 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2
diff --git a/chromium/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libyuv/source/compare_neon.cc
index 2a2181e0cb3..afdd6012164 100644
--- a/chromium/third_party/libyuv/source/compare_neon.cc
+++ b/chromium/third_party/libyuv/source/compare_neon.cc
@@ -29,24 +29,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
uint32_t diff;
asm volatile(
- "vmov.u16 q4, #0 \n" // accumulator
+ "vmov.u16 q4, #0 \n" // accumulator
"1: \n"
- "vld1.8 {q0, q1}, [%0]! \n"
- "vld1.8 {q2, q3}, [%1]! \n"
- "veor.32 q0, q0, q2 \n"
- "veor.32 q1, q1, q3 \n"
- "vcnt.i8 q0, q0 \n"
- "vcnt.i8 q1, q1 \n"
- "subs %2, %2, #32 \n"
- "vadd.u8 q0, q0, q1 \n" // 16 byte counts
- "vpadal.u8 q4, q0 \n" // 8 shorts
- "bgt 1b \n"
+ "vld1.8 {q0, q1}, [%0]! \n"
+ "vld1.8 {q2, q3}, [%1]! \n"
+ "veor.32 q0, q0, q2 \n"
+ "veor.32 q1, q1, q3 \n"
+ "vcnt.i8 q0, q0 \n"
+ "vcnt.i8 q1, q1 \n"
+ "subs %2, %2, #32 \n"
+ "vadd.u8 q0, q0, q1 \n" // 16 byte counts
+ "vpadal.u8 q4, q0 \n" // 8 shorts
+ "bgt 1b \n"
- "vpaddl.u16 q0, q4 \n" // 4 ints
- "vpadd.u32 d0, d0, d1 \n"
- "vpadd.u32 d0, d0, d0 \n"
- "vmov.32 %3, d0[0] \n"
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
+ "vpadd.u32 d0, d0, d0 \n"
+ "vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
@@ -59,29 +59,29 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
"1: \n"
- "vld1.8 {q0}, [%0]! \n"
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
diff --git a/chromium/third_party/libyuv/source/compare_neon64.cc b/chromium/third_party/libyuv/source/compare_neon64.cc
index a22ba75b330..70fb9b9143f 100644
--- a/chromium/third_party/libyuv/source/compare_neon64.cc
+++ b/chromium/third_party/libyuv/source/compare_neon64.cc
@@ -27,24 +27,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
int count) {
uint32_t diff;
asm volatile(
- "movi v4.8h, #0 \n"
+ "movi v4.8h, #0 \n"
"1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
- "eor v0.16b, v0.16b, v2.16b \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "eor v1.16b, v1.16b, v3.16b \n"
- "cnt v0.16b, v0.16b \n"
- "prfm pldl1keep, [%1, 448] \n"
- "cnt v1.16b, v1.16b \n"
- "subs %w2, %w2, #32 \n"
- "add v0.16b, v0.16b, v1.16b \n"
- "uadalp v4.8h, v0.16b \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
+ "eor v0.16b, v0.16b, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "eor v1.16b, v1.16b, v3.16b \n"
+ "cnt v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "cnt v1.16b, v1.16b \n"
+ "subs %w2, %w2, #32 \n"
+ "add v0.16b, v0.16b, v1.16b \n"
+ "uadalp v4.8h, v0.16b \n"
+ "b.gt 1b \n"
- "uaddlv s4, v4.8h \n"
- "fmov %w3, s4 \n"
+ "uaddlv s4, v4.8h \n"
+ "fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
@@ -56,30 +56,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %w2, %w2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "prfm pldl1keep, [%1, 448] \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
diff --git a/chromium/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libyuv/source/cpu_id.cc
index 936d7d34088..fe89452b772 100644
--- a/chromium/third_party/libyuv/source/cpu_id.cc
+++ b/chromium/third_party/libyuv/source/cpu_id.cc
@@ -75,9 +75,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) {
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
- "mov %%ebx, %%edi \n"
+ "mov %%ebx, %%edi \n"
"cpuid \n"
- "xchg %%edi, %%ebx \n"
+ "xchg %%edi, %%ebx \n"
: "=D"(info_ebx),
#else
"cpuid \n"
diff --git a/chromium/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libyuv/source/planar_functions.cc
index d5cd7e6808e..4e8908c2eba 100644
--- a/chromium/third_party/libyuv/source/planar_functions.cc
+++ b/chromium/third_party/libyuv/source/planar_functions.cc
@@ -349,6 +349,39 @@ int I420ToI400(const uint8_t* src_y,
return 0;
}
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+ int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_uv, int dst_stride_uv, int width, int height) {
+ if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
+ halfheight);
+ return 0;
+}
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+ int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_vu, int dst_stride_vu, int width, int height) {
+ return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_vu, dst_stride_vu, width, height);
+}
+
// Support function for NV12 etc UV channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
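
Not part of the patch: a minimal usage sketch for the NV12Copy() implementation added above, using the signature from this diff. The frame dimensions and fill values are illustrative.

#include <stdint.h>
#include <vector>

#include "libyuv/planar_functions.h"

int main() {
  const int width = 640;
  const int height = 360;
  const int half_width = (width + 1) / 2;
  const int half_height = (height + 1) / 2;

  // NV12 layout: a full-resolution Y plane plus one interleaved UV plane at
  // half resolution in both dimensions (2 bytes per chroma sample pair).
  std::vector<uint8_t> src_y(static_cast<size_t>(width) * height, 0x40);
  std::vector<uint8_t> src_uv(static_cast<size_t>(half_width) * 2 * half_height,
                              0x80);
  std::vector<uint8_t> dst_y(src_y.size());
  std::vector<uint8_t> dst_uv(src_uv.size());

  // A negative height requests a vertically inverted copy, matching the
  // "Negative height means invert the image" handling in NV12Copy() above.
  int ret = NV12Copy(src_y.data(), /*src_stride_y=*/width,
                     src_uv.data(), /*src_stride_uv=*/half_width * 2,
                     dst_y.data(), /*dst_stride_y=*/width,
                     dst_uv.data(), /*dst_stride_uv=*/half_width * 2,
                     width, -height);
  return ret;  // 0 on success, -1 on invalid arguments.
}
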
diff --git a/chromium/third_party/libyuv/source/rotate_gcc.cc b/chromium/third_party/libyuv/source/rotate_gcc.cc
index 04e19e29eef..fd359d4ae69 100644
--- a/chromium/third_party/libyuv/source/rotate_gcc.cc
+++ b/chromium/third_party/libyuv/source/rotate_gcc.cc
@@ -31,75 +31,75 @@ void TransposeWx8_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movq (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "movq (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movq (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "movq (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movq (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "lea 0x8(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "neg %3 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -121,127 +121,127 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqu (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqu (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqu (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqu (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqu (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -266,95 +266,95 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqu (%0),%%xmm4 \n"
- "movdqu (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
// Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
diff --git a/chromium/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libyuv/source/rotate_neon.cc
index fdc0dd476c6..844df2bf305 100644
--- a/chromium/third_party/libyuv/source/rotate_neon.cc
+++ b/chromium/third_party/libyuv/source/rotate_neon.cc
@@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld1.8 {d0}, [%0], %2 \n"
- "vld1.8 {d1}, [%0], %2 \n"
- "vld1.8 {d2}, [%0], %2 \n"
- "vld1.8 {d3}, [%0], %2 \n"
- "vld1.8 {d4}, [%0], %2 \n"
- "vld1.8 {d5}, [%0], %2 \n"
- "vld1.8 {d6}, [%0], %2 \n"
- "vld1.8 {d7}, [%0] \n"
-
- "vtrn.8 d1, d0 \n"
- "vtrn.8 d3, d2 \n"
- "vtrn.8 d5, d4 \n"
- "vtrn.8 d7, d6 \n"
-
- "vtrn.16 d1, d3 \n"
- "vtrn.16 d0, d2 \n"
- "vtrn.16 d5, d7 \n"
- "vtrn.16 d4, d6 \n"
-
- "vtrn.32 d1, d5 \n"
- "vtrn.32 d0, d4 \n"
- "vtrn.32 d3, d7 \n"
- "vtrn.32 d2, d6 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d1}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d3}, [%0], %4 \n"
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d5}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d7}, [%0], %4 \n"
- "vst1.8 {d6}, [%0] \n"
-
- "add %1, #8 \n" // src += 8
- "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
- "subs %5, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld1.8 {d0}, [%0], %2 \n"
+ "vld1.8 {d1}, [%0], %2 \n"
+ "vld1.8 {d2}, [%0], %2 \n"
+ "vld1.8 {d3}, [%0], %2 \n"
+ "vld1.8 {d4}, [%0], %2 \n"
+ "vld1.8 {d5}, [%0], %2 \n"
+ "vld1.8 {d6}, [%0], %2 \n"
+ "vld1.8 {d7}, [%0] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d1}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d3}, [%0], %4 \n"
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d5}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d7}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0] \n"
+
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
@@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld2.8 {d0, d1}, [%0], %2 \n"
- "vld2.8 {d2, d3}, [%0], %2 \n"
- "vld2.8 {d4, d5}, [%0], %2 \n"
- "vld2.8 {d6, d7}, [%0], %2 \n"
- "vld2.8 {d16, d17}, [%0], %2 \n"
- "vld2.8 {d18, d19}, [%0], %2 \n"
- "vld2.8 {d20, d21}, [%0], %2 \n"
- "vld2.8 {d22, d23}, [%0] \n"
-
- "vtrn.8 q1, q0 \n"
- "vtrn.8 q3, q2 \n"
- "vtrn.8 q9, q8 \n"
- "vtrn.8 q11, q10 \n"
-
- "vtrn.16 q1, q3 \n"
- "vtrn.16 q0, q2 \n"
- "vtrn.16 q9, q11 \n"
- "vtrn.16 q8, q10 \n"
-
- "vtrn.32 q1, q9 \n"
- "vtrn.32 q0, q8 \n"
- "vtrn.32 q3, q11 \n"
- "vtrn.32 q2, q10 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
- "vrev16.8 q8, q8 \n"
- "vrev16.8 q9, q9 \n"
- "vrev16.8 q10, q10 \n"
- "vrev16.8 q11, q11 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d6}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d18}, [%0], %4 \n"
- "vst1.8 {d16}, [%0], %4 \n"
- "vst1.8 {d22}, [%0], %4 \n"
- "vst1.8 {d20}, [%0] \n"
-
- "mov %0, %5 \n"
-
- "vst1.8 {d3}, [%0], %6 \n"
- "vst1.8 {d1}, [%0], %6 \n"
- "vst1.8 {d7}, [%0], %6 \n"
- "vst1.8 {d5}, [%0], %6 \n"
- "vst1.8 {d19}, [%0], %6 \n"
- "vst1.8 {d17}, [%0], %6 \n"
- "vst1.8 {d23}, [%0], %6 \n"
- "vst1.8 {d21}, [%0] \n"
-
- "add %1, #8*2 \n" // src += 8*2
- "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %7, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ "vld2.8 {d22, d23}, [%0] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d18}, [%0], %4 \n"
+ "vst1.8 {d16}, [%0], %4 \n"
+ "vst1.8 {d22}, [%0], %4 \n"
+ "vst1.8 {d20}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.8 {d3}, [%0], %6 \n"
+ "vst1.8 {d1}, [%0], %6 \n"
+ "vst1.8 {d7}, [%0], %6 \n"
+ "vst1.8 {d5}, [%0], %6 \n"
+ "vst1.8 {d19}, [%0], %6 \n"
+ "vst1.8 {d17}, [%0], %6 \n"
+ "vst1.8 {d23}, [%0], %6 \n"
+ "vst1.8 {d21}, [%0] \n"
+
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
+ // dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
diff --git a/chromium/third_party/libyuv/source/rotate_neon64.cc b/chromium/third_party/libyuv/source/rotate_neon64.cc
index 99f7ee16ca0..43c1581731d 100644
--- a/chromium/third_party/libyuv/source/rotate_neon64.cc
+++ b/chromium/third_party/libyuv/source/rotate_neon64.cc
@@ -34,74 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
- "sub %w3, %w3, #8 \n"
+ "sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
- "ld1 {v0.8b}, [%0], %5 \n"
- "ld1 {v1.8b}, [%0], %5 \n"
- "ld1 {v2.8b}, [%0], %5 \n"
- "ld1 {v3.8b}, [%0], %5 \n"
- "ld1 {v4.8b}, [%0], %5 \n"
- "ld1 {v5.8b}, [%0], %5 \n"
- "ld1 {v6.8b}, [%0], %5 \n"
- "ld1 {v7.8b}, [%0] \n"
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
"mov %0, %1 \n"
- "trn2 v16.8b, v0.8b, v1.8b \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "trn1 v17.8b, v0.8b, v1.8b \n"
- "add %0, %0, %5 \n"
- "trn2 v18.8b, v2.8b, v3.8b \n"
- "prfm pldl1keep, [%0, 448] \n" // row 1
- "trn1 v19.8b, v2.8b, v3.8b \n"
- "add %0, %0, %5 \n"
- "trn2 v20.8b, v4.8b, v5.8b \n"
- "prfm pldl1keep, [%0, 448] \n" // row 2
- "trn1 v21.8b, v4.8b, v5.8b \n"
- "add %0, %0, %5 \n"
- "trn2 v22.8b, v6.8b, v7.8b \n"
- "prfm pldl1keep, [%0, 448] \n" // row 3
- "trn1 v23.8b, v6.8b, v7.8b \n"
- "add %0, %0, %5 \n"
-
- "trn2 v3.4h, v17.4h, v19.4h \n"
- "prfm pldl1keep, [%0, 448] \n" // row 4
- "trn1 v1.4h, v17.4h, v19.4h \n"
- "add %0, %0, %5 \n"
- "trn2 v2.4h, v16.4h, v18.4h \n"
- "prfm pldl1keep, [%0, 448] \n" // row 5
- "trn1 v0.4h, v16.4h, v18.4h \n"
- "add %0, %0, %5 \n"
- "trn2 v7.4h, v21.4h, v23.4h \n"
- "prfm pldl1keep, [%0, 448] \n" // row 6
- "trn1 v5.4h, v21.4h, v23.4h \n"
- "add %0, %0, %5 \n"
- "trn2 v6.4h, v20.4h, v22.4h \n"
- "prfm pldl1keep, [%0, 448] \n" // row 7
- "trn1 v4.4h, v20.4h, v22.4h \n"
-
- "trn2 v21.2s, v1.2s, v5.2s \n"
- "trn1 v17.2s, v1.2s, v5.2s \n"
- "trn2 v20.2s, v0.2s, v4.2s \n"
- "trn1 v16.2s, v0.2s, v4.2s \n"
- "trn2 v23.2s, v3.2s, v7.2s \n"
- "trn1 v19.2s, v3.2s, v7.2s \n"
- "trn2 v22.2s, v2.2s, v6.2s \n"
- "trn1 v18.2s, v2.2s, v6.2s \n"
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 1
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 2
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 3
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+ "add %0, %0, %5 \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 4
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 5
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 6
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 7
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %2 \n"
- "st1 {v17.8b}, [%0], %6 \n"
- "st1 {v16.8b}, [%0], %6 \n"
- "st1 {v19.8b}, [%0], %6 \n"
- "st1 {v18.8b}, [%0], %6 \n"
- "st1 {v21.8b}, [%0], %6 \n"
- "st1 {v20.8b}, [%0], %6 \n"
- "st1 {v23.8b}, [%0], %6 \n"
- "st1 {v22.8b}, [%0] \n"
+ "st1 {v17.8b}, [%0], %6 \n"
+ "st1 {v16.8b}, [%0], %6 \n"
+ "st1 {v19.8b}, [%0], %6 \n"
+ "st1 {v18.8b}, [%0], %6 \n"
+ "st1 {v21.8b}, [%0], %6 \n"
+ "st1 {v20.8b}, [%0], %6 \n"
+ "st1 {v23.8b}, [%0], %6 \n"
+ "st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
@@ -110,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src,
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w3, %w3, #8 \n"
- "b.eq 4f \n"
+ "adds %w3, %w3, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w3, #2 \n"
- "b.lt 3f \n"
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
- "cmp %w3, #4 \n"
- "b.lt 2f \n"
+ "cmp %w3, #4 \n"
+ "b.lt 2f \n"
// 4x8 block
- "mov %0, %1 \n"
- "ld1 {v0.s}[0], [%0], %5 \n"
- "ld1 {v0.s}[1], [%0], %5 \n"
- "ld1 {v0.s}[2], [%0], %5 \n"
- "ld1 {v0.s}[3], [%0], %5 \n"
- "ld1 {v1.s}[0], [%0], %5 \n"
- "ld1 {v1.s}[1], [%0], %5 \n"
- "ld1 {v1.s}[2], [%0], %5 \n"
- "ld1 {v1.s}[3], [%0] \n"
+ "mov %0, %1 \n"
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ "ld1 {v1.s}[3], [%0] \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- "ld1 {v2.16b}, [%4] \n"
+ "ld1 {v2.16b}, [%4] \n"
- "tbl v3.16b, {v0.16b}, v2.16b \n"
- "tbl v0.16b, {v1.16b}, v2.16b \n"
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
@@ -228,90 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
- "sub %w4, %w4, #8 \n"
+ "sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "ld1 {v0.16b}, [%0], %5 \n"
- "ld1 {v1.16b}, [%0], %5 \n"
- "ld1 {v2.16b}, [%0], %5 \n"
- "ld1 {v3.16b}, [%0], %5 \n"
- "ld1 {v4.16b}, [%0], %5 \n"
- "ld1 {v5.16b}, [%0], %5 \n"
- "ld1 {v6.16b}, [%0], %5 \n"
- "ld1 {v7.16b}, [%0] \n"
- "mov %0, %1 \n"
-
- "trn1 v16.16b, v0.16b, v1.16b \n"
- "trn2 v17.16b, v0.16b, v1.16b \n"
- "trn1 v18.16b, v2.16b, v3.16b \n"
- "trn2 v19.16b, v2.16b, v3.16b \n"
- "trn1 v20.16b, v4.16b, v5.16b \n"
- "trn2 v21.16b, v4.16b, v5.16b \n"
- "trn1 v22.16b, v6.16b, v7.16b \n"
- "trn2 v23.16b, v6.16b, v7.16b \n"
-
- "trn1 v0.8h, v16.8h, v18.8h \n"
- "trn2 v1.8h, v16.8h, v18.8h \n"
- "trn1 v2.8h, v20.8h, v22.8h \n"
- "trn2 v3.8h, v20.8h, v22.8h \n"
- "trn1 v4.8h, v17.8h, v19.8h \n"
- "trn2 v5.8h, v17.8h, v19.8h \n"
- "trn1 v6.8h, v21.8h, v23.8h \n"
- "trn2 v7.8h, v21.8h, v23.8h \n"
-
- "trn1 v16.4s, v0.4s, v2.4s \n"
- "trn2 v17.4s, v0.4s, v2.4s \n"
- "trn1 v18.4s, v1.4s, v3.4s \n"
- "trn2 v19.4s, v1.4s, v3.4s \n"
- "trn1 v20.4s, v4.4s, v6.4s \n"
- "trn2 v21.4s, v4.4s, v6.4s \n"
- "trn1 v22.4s, v5.4s, v7.4s \n"
- "trn2 v23.4s, v5.4s, v7.4s \n"
+ "mov %0, %1 \n"
- "mov %0, %2 \n"
+ "ld1 {v0.16b}, [%0], %5 \n"
+ "ld1 {v1.16b}, [%0], %5 \n"
+ "ld1 {v2.16b}, [%0], %5 \n"
+ "ld1 {v3.16b}, [%0], %5 \n"
+ "ld1 {v4.16b}, [%0], %5 \n"
+ "ld1 {v5.16b}, [%0], %5 \n"
+ "ld1 {v6.16b}, [%0], %5 \n"
+ "ld1 {v7.16b}, [%0] \n"
+ "mov %0, %1 \n"
- "st1 {v16.d}[0], [%0], %6 \n"
- "st1 {v18.d}[0], [%0], %6 \n"
- "st1 {v17.d}[0], [%0], %6 \n"
- "st1 {v19.d}[0], [%0], %6 \n"
- "st1 {v16.d}[1], [%0], %6 \n"
- "st1 {v18.d}[1], [%0], %6 \n"
- "st1 {v17.d}[1], [%0], %6 \n"
- "st1 {v19.d}[1], [%0] \n"
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
- "mov %0, %3 \n"
+ "mov %0, %2 \n"
- "st1 {v20.d}[0], [%0], %7 \n"
- "st1 {v22.d}[0], [%0], %7 \n"
- "st1 {v21.d}[0], [%0], %7 \n"
- "st1 {v23.d}[0], [%0], %7 \n"
- "st1 {v20.d}[1], [%0], %7 \n"
- "st1 {v22.d}[1], [%0], %7 \n"
- "st1 {v21.d}[1], [%0], %7 \n"
- "st1 {v23.d}[1], [%0] \n"
-
- "add %1, %1, #16 \n" // src += 8*2
- "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
+ "st1 {v16.d}[0], [%0], %6 \n"
+ "st1 {v18.d}[0], [%0], %6 \n"
+ "st1 {v17.d}[0], [%0], %6 \n"
+ "st1 {v19.d}[0], [%0], %6 \n"
+ "st1 {v16.d}[1], [%0], %6 \n"
+ "st1 {v18.d}[1], [%0], %6 \n"
+ "st1 {v17.d}[1], [%0], %6 \n"
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v20.d}[0], [%0], %7 \n"
+ "st1 {v22.d}[0], [%0], %7 \n"
+ "st1 {v21.d}[0], [%0], %7 \n"
+ "st1 {v23.d}[0], [%0], %7 \n"
+ "st1 {v20.d}[1], [%0], %7 \n"
+ "st1 {v22.d}[1], [%0], %7 \n"
+ "st1 {v21.d}[1], [%0], %7 \n"
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
// dst_stride_a
- "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
// dst_stride_b
- "subs %w4, %w4, #8 \n" // w -= 8
- "b.ge 1b \n"
+ "subs %w4, %w4, #8 \n" // w -= 8
+ "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w4, %w4, #8 \n"
- "b.eq 4f \n"
+ "adds %w4, %w4, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w4, #2 \n"
- "b.lt 3f \n"
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
- "cmp %w4, #4 \n"
- "b.lt 2f \n"
+ "cmp %w4, #4 \n"
+ "b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
diff --git a/chromium/third_party/libyuv/source/row_gcc.cc b/chromium/third_party/libyuv/source/row_gcc.cc
index c4c012ff412..a107c30e769 100644
--- a/chromium/third_party/libyuv/source/row_gcc.cc
+++ b/chromium/third_party/libyuv/source/row_gcc.cc
@@ -159,24 +159,24 @@ static const lvec8 kShuffleNV21 = {
#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -190,35 +190,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -228,35 +228,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -267,35 +267,35 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
// Same code as RAWToARGB with different shuffler and A in low bits
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
- "psrld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
+ "psrld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
"+r"(width) // %2
@@ -307,25 +307,25 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
uint8_t* dst_rgb24,
int width) {
asm volatile(
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
- "movdqa %5,%%xmm5 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x4(%0),%%xmm1 \n"
- "movdqu 0x8(%0),%%xmm2 \n"
- "lea 0x18(%0),%0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x4(%0),%%xmm1 \n"
+ "movdqu 0x8(%0),%%xmm2 \n"
+ "lea 0x18(%0),%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -337,44 +337,44 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x20802080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xa,%%xmm4 \n"
- "psrlw $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -385,47 +385,47 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x42004200,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "movdqa %%xmm3,%%xmm4 \n"
- "psrlw $0x6,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psllw $0x1,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -436,34 +436,34 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0xf0f0f0f,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x4,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pand %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "psllw $0x4,%%xmm1 \n"
- "psrlw $0x4,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,0x00(%1,%0,2) \n"
- "movdqu %%xmm1,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,0x00(%1,%0,2) \n"
+ "movdqu %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -474,35 +474,35 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm6 \n"
+ "movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -513,35 +513,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm6 \n"
+ "movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -556,37 +556,37 @@ static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
+ "vmovdqa %4,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -615,26 +615,26 @@ static const ulvec8 kPermARGBToRGB24_2 = {
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vmovdqa %3,%%ymm5 \n"
- "vmovdqa %4,%%ymm6 \n"
- "vmovdqa %5,%%ymm7 \n"
+ "vmovdqa %3,%%ymm5 \n"
+ "vmovdqa %4,%%ymm6 \n"
+ "vmovdqa %5,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
- "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
- "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
+ "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -650,37 +650,37 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
+ "vmovdqa %4,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -694,34 +694,34 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -734,40 +734,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
const uint32_t dither4,
int width) {
asm volatile(
- "movd %3,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "punpcklwd %%xmm6,%%xmm6 \n"
- "punpckhwd %%xmm7,%%xmm7 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "paddusb %%xmm6,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -783,35 +783,35 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
int width) {
asm volatile(
"vbroadcastss %3,%%xmm6 \n"
- "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
- "vpermq $0xd8,%%ymm6,%%ymm6 \n"
- "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
- "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
- "vpslld $0x5,%%ymm4,%%ymm4 \n"
- "vpslld $0xb,%%ymm3,%%ymm5 \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n"
+ "vpslld $0xb,%%ymm3,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
- "vpsrld $0x5,%%ymm0,%%ymm2 \n"
- "vpsrld $0x3,%%ymm0,%%ymm1 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm4,%%ymm2,%%ymm2 \n"
- "vpand %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpor %%ymm2,%%ymm1,%%ymm1 \n"
- "vpor %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -824,38 +824,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1b,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x5,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pslld $0xa,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "pslld $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "psrad $0x10,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x6,%%xmm2 \n"
- "psrld $0x9,%%xmm3 \n"
- "pand %%xmm7,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm6,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -865,26 +865,26 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -928,31 +928,31 @@ static const uint32_t kMulAG10 = 64 * 65536 + 1028;
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -967,31 +967,31 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -1008,25 +1008,25 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -1045,25 +1045,25 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -1150,9 +1150,9 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- "movdqa %5,%%xmm7 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
LABELALIGN RGBTOY(xmm7)
: "+r"(src_argb), // %0
@@ -1171,8 +1171,8 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
LABELALIGN RGBTOY(xmm5)
: "+r"(src_argb), // %0
@@ -1189,8 +1189,8 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
// Same as ARGBToYRow but different coefficients, no add 16.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
LABELALIGN RGBTOY(xmm5)
: "+r"(src_rgba), // %0
@@ -1212,7 +1212,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
- "vmovdqu %6,%%ymm6 \n"
+ "vmovdqu %6,%%ymm6 \n"
LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_argb), // %0
@@ -1234,7 +1234,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
- "vmovdqu %6,%%ymm6 \n"
+ "vmovdqu %6,%%ymm6 \n"
LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_abgr), // %0
@@ -1255,7 +1255,7 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
+ "vmovdqu %5,%%ymm6 \n"
LABELALIGN RGBTOY_AVX2(ymm5)
: "+r"(src_argb), // %0
@@ -1275,7 +1275,7 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
+ "vmovdqu %5,%%ymm6 \n"
LABELALIGN RGBTOY_AVX2(
ymm5) "vzeroupper \n"
@@ -1296,52 +1296,52 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1368,44 +1368,44 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
@@ -1431,44 +1431,44 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
@@ -1494,45 +1494,45 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
@@ -1555,53 +1555,53 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1620,47 +1620,47 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %4,%%xmm3 \n"
- "movdqa %5,%%xmm4 \n"
- "movdqa %6,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea 0x40(%0),%0 \n"
- "movdqu %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqu %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1674,9 +1674,9 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- "movdqa %5,%%xmm7 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
LABELALIGN RGBTOY(xmm7)
: "+r"(src_bgra), // %0
@@ -1695,52 +1695,52 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_bgra0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1754,9 +1754,9 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- "movdqa %5,%%xmm7 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
LABELALIGN RGBTOY(xmm7)
: "+r"(src_abgr), // %0
@@ -1771,9 +1771,9 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- "movdqa %5,%%xmm7 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
LABELALIGN RGBTOY(xmm7)
: "+r"(src_rgba), // %0
@@ -1792,52 +1792,52 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1855,52 +1855,52 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_rgba0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -2117,16 +2117,16 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2146,27 +2146,27 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
- "sub %[u_buf],%[v_buf] \n"
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0,(%[dst_rgb24]) \n"
- "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
- "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
- "subl $0x8,%[width] \n"
- "jg 1b \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0,(%[dst_rgb24]) \n"
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2192,16 +2192,16 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2221,21 +2221,21 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2256,16 +2256,16 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2286,21 +2286,21 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2323,15 +2323,15 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422
YUVTORGB(yuvconstants)
STOREARGB
- "subl $0x8,%[width] \n"
- "jg 1b \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2358,15 +2358,15 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2386,15 +2386,15 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2414,15 +2414,15 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2442,15 +2442,15 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2471,16 +2471,16 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STORERGBA
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2695,17 +2695,17 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2729,18 +2729,18 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2764,23 +2764,23 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2804,18 +2804,18 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2839,23 +2839,23 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2881,16 +2881,16 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "subl $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2920,11 +2920,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
@@ -2964,16 +2964,16 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2997,16 +2997,16 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -3030,16 +3030,16 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -3063,16 +3063,16 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -3092,10 +3092,10 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164
- "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
- "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
- "pslld $0x18,%%xmm4 \n"
+ "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164
+ "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
+ "pslld $0x18,%%xmm4 \n"
LABELALIGN
"1: \n"
@@ -3137,10 +3137,10 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
- "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164
- "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
- "vpslld $0x18,%%ymm4,%%ymm4 \n"
+ "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164
+ "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
@@ -3182,16 +3182,16 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %3,%%xmm5 \n"
+ "movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu -0x10(%0,%2,1),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu -0x10(%0,%2,1),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -3209,13 +3209,13 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
LABELALIGN
"1: \n"
- "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpermq $0x4e,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3234,16 +3234,16 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %3,%%xmm5 \n"
+ "movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu -0x10(%0,%2,2),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(temp_width) // %2
@@ -3261,13 +3261,13 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
LABELALIGN
"1: \n"
- "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpermq $0x4e,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
@@ -3287,20 +3287,20 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src,
int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %4,%%xmm1 \n"
- "lea -0x10(%0,%3,2),%0 \n"
- "sub %1,%2 \n"
+ "movdqa %4,%%xmm1 \n"
+ "lea -0x10(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "pshufb %%xmm1,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n"
- "movhpd %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $8,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -3327,27 +3327,27 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
intptr_t temp_width = (intptr_t)(width);
src_rgb24 += width * 3 - 48;
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n" // first 5
- "movdqu 15(%0),%%xmm1 \n" // next 5
- "movdqu 30(%0),%%xmm2 \n" // next 5
- "movdqu 32(%0),%%xmm3 \n" // last 1 special
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "pshufb %%xmm5,%%xmm3 \n"
- "lea -0x30(%0),%0 \n"
- "movdqu %%xmm0,32(%1) \n" // last 5
- "movdqu %%xmm1,17(%1) \n" // next 5
- "movdqu %%xmm2,2(%1) \n" // next 5
- "movlpd %%xmm3,0(%1) \n" // first 1
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n" // first 5
+ "movdqu 15(%0),%%xmm1 \n" // next 5
+ "movdqu 30(%0),%%xmm2 \n" // next 5
+ "movdqu 32(%0),%%xmm3 \n" // last 1 special
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm3 \n"
+ "lea -0x30(%0),%0 \n"
+ "movdqu %%xmm0,32(%1) \n" // last 5
+ "movdqu %%xmm1,17(%1) \n" // next 5
+ "movdqu %%xmm2,2(%1) \n" // next 5
+ "movlpd %%xmm3,0(%1) \n" // first 1
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_rgb24), // %1
"+r"(temp_width) // %2
@@ -3363,17 +3363,17 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "lea -0x10(%0,%2,4),%0 \n"
+ "lea -0x10(%0,%2,4),%0 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufd $0x1b,%%xmm0,%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -3389,15 +3389,15 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "vmovdqu %3,%%ymm5 \n"
+ "vmovdqu %3,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3413,28 +3413,28 @@ void SplitUVRow_AVX2(const uint8_t* src_uv,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@@ -3451,28 +3451,28 @@ void SplitUVRow_SSE2(const uint8_t* src_uv,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm2,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -3489,22 +3489,22 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm2,(%2) \n"
"vextractf128 $0x0,%%ymm0,0x10(%2) \n"
"vextractf128 $0x1,%%ymm2,0x20(%2) \n"
"vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@@ -3522,21 +3522,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -3559,30 +3559,30 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
int width) {
// clang-format off
asm volatile (
- "vmovd %4,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %4,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu (%0,%1,1),%%ymm1 \n"
- "add $0x20,%0 \n"
-
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
- "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "add $0x40,%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu (%0,%1,1),%%ymm1 \n"
+ "add $0x20,%0 \n"
+
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2,(%2) \n"
+ "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
+ "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
+ "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
+ "add $0x40,%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -3605,24 +3605,24 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%0,%1) \n"
- "vmovdqu %%ymm1,0x20(%0,%1) \n"
- "add $0x40,%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3643,23 +3643,23 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "add $0x20,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "add $0x20,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3675,25 +3675,25 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "add $0x40,%0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3714,25 +3714,25 @@ void Convert8To16Row_SSE2(const uint8_t* src_y,
int width) {
// clang-format off
asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "add $0x10,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3748,26 +3748,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "add $0x40,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "add $0x40,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3819,41 +3819,41 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
- "lea 0x10(%3),%3 \n"
- "lea 0x30(%0),%0 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "lea 0x10(%3),%3 \n"
+ "lea 0x30(%0),%0 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -3914,42 +3914,42 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,16(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,32(%3) \n"
-
- "lea 0x10(%0),%0 \n"
- "lea 0x10(%1),%1 \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x30(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,16(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,32(%3) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x30(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -3971,35 +3971,35 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "test $0xf,%0 \n"
- "jne 2f \n"
- "test $0xf,%1 \n"
- "jne 2f \n"
+ "test $0xf,%0 \n"
+ "jne 2f \n"
+ "test $0xf,%1 \n"
+ "jne 2f \n"
LABELALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 9f \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"
LABELALIGN
"2: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 2b \n"
-
- LABELALIGN "9: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
+
+ LABELALIGN "9: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -4014,14 +4014,14 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -4036,7 +4036,7 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep movsb \n"
+ "rep movsb \n"
: "+S"(src), // %0
"+D"(dst), // %1
"+c"(width_tmp) // %2
@@ -4049,29 +4049,29 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -4084,21 +4084,21 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "vmovdqu 0x20(%0),%%ymm2 \n"
- "lea 0x40(%0),%0 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "vmovdqu 0x20(%0),%%ymm2 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -4117,17 +4117,17 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0), %%xmm0 \n"
- "movdqu 0x10(%0), %%xmm1 \n"
- "lea 0x20(%0), %0 \n"
- "psrld $0x18, %%xmm0 \n"
- "psrld $0x18, %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "packuswb %%xmm0, %%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1), %1 \n"
- "sub $0x8, %2 \n"
- "jg 1b \n"
+ "movdqu (%0), %%xmm0 \n"
+ "movdqu 0x10(%0), %%xmm1 \n"
+ "lea 0x20(%0), %0 \n"
+ "psrld $0x18, %%xmm0 \n"
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1), %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+rm"(width) // %2
@@ -4145,28 +4145,28 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile(
- "vmovdqa %3,%%ymm4 \n"
+ "vmovdqa %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0), %%ymm0 \n"
- "vmovdqu 0x20(%0), %%ymm1 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x40(%0), %%ymm2 \n"
- "vmovdqu 0x60(%0), %%ymm3 \n"
- "lea 0x80(%0), %0 \n"
- "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20, %2 \n"
- "jg 1b \n"
+ "vmovdqu (%0), %%ymm0 \n"
+ "vmovdqu 0x20(%0), %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x40(%0), %%ymm2 \n"
+ "vmovdqu 0x60(%0), %%ymm3 \n"
+ "lea 0x80(%0), %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
@@ -4181,31 +4181,31 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm2 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpckhwd %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm2 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -4218,23 +4218,23 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
- "vpmovzxbd (%0),%%ymm1 \n"
- "vpmovzxbd 0x8(%0),%%ymm2 \n"
- "lea 0x10(%0),%0 \n"
- "vpslld $0x18,%%ymm1,%%ymm1 \n"
- "vpslld $0x18,%%ymm2,%%ymm2 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vpmovzxbd (%0),%%ymm1 \n"
+ "vpmovzxbd 0x8(%0),%%ymm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -4250,7 +4250,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
asm volatile(
- "rep stosl \n"
+ "rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -4261,7 +4261,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep stosb \n"
+ "rep stosb \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v8) // %2
@@ -4272,7 +4272,7 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep stosl \n"
+ "rep stosl \n"
: "+D"(dst_argb), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -4283,21 +4283,21 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -4311,32 +4311,32 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4350,28 +4350,28 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4385,16 +4385,16 @@ void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -4408,32 +4408,32 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4447,28 +4447,28 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4481,22 +4481,22 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
#ifdef HAS_YUY2TOYROW_AVX2
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
@@ -4511,32 +4511,32 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -4551,30 +4551,30 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -4589,17 +4589,17 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@@ -4613,32 +4613,32 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -4653,30 +4653,30 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -4698,71 +4698,71 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
// 1 pixel loop.
"91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 91b \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4786,36 +4786,36 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0,
uint8_t* dst,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "mov $0x807f807f,%%eax \n"
- "movd %%eax,%%xmm7 \n"
- "pshufd $0x0,%%xmm7,%%xmm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%2),%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm0 \n"
- "movq (%0,%2,1),%%xmm1 \n"
- "movq (%1,%2,1),%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "paddw %%xmm7,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%3,%2,1) \n"
- "lea 0x8(%2),%2 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(alpha), // %2
@@ -4838,43 +4838,43 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
uint8_t* dst,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsllw $0x8,%%ymm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm6 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
"vbroadcastss %%xmm6,%%ymm6 \n"
- "mov $0x807f807f,%%eax \n"
- "vmovd %%eax,%%xmm7 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
"vbroadcastss %%xmm7,%%ymm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
// 32 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%2),%%ymm0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
- "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
- "vmovdqu (%0,%2,1),%%ymm1 \n"
- "vmovdqu (%1,%2,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%3,%2,1) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src0), // %0
"+r"(src1), // %1
@@ -4898,35 +4898,35 @@ void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "pslld $0x18,%%xmm3 \n"
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm1,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "punpckhbw %%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "pand %%xmm3,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -4947,29 +4947,29 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpslld $0x18,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
- "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm6,%%ymm6 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpor %%ymm6,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4989,32 +4989,32 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movzb 0x03(%0),%3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x07(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movzb 0x03(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x07(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
@@ -5034,52 +5034,52 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
int width) {
uintptr_t alpha;
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
"vbroadcastf128 %5,%%ymm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
// replace VPGATHER
- "movzb 0x03(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x07(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "movzb 0x13(%0),%3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x17(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x1b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x1f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "movzb 0x03(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x07(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "movzb 0x13(%0),%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x17(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x1b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x1f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
"vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
"vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
"vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
// end of VPGATHER
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
- "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -5096,42 +5096,42 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "psubb %%xmm5,%%xmm0 \n"
- "psubb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm4,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "movdqu %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm6 \n"
- "paddw %%xmm5,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "psrld $0x18,%%xmm2 \n"
- "psrld $0x18,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movdqa %%xmm6,%%xmm3 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm2,%%xmm3 \n"
- "movdqa %%xmm6,%%xmm1 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm1 \n"
- "movdqu %%xmm6,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psubb %%xmm5,%%xmm0 \n"
+ "psubb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm4,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "movdqu %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm6 \n"
+ "paddw %%xmm5,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm6,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5158,50 +5158,50 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
asm volatile(
- "movdqa %2,%%xmm2 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "phaddw %%xmm6,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%1 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "m"(kARGBToSepiaB), // %2
@@ -5219,54 +5219,54 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "movdqu (%3),%%xmm5 \n"
- "pshufd $0x00,%%xmm5,%%xmm2 \n"
- "pshufd $0x55,%%xmm5,%%xmm3 \n"
- "pshufd $0xaa,%%xmm5,%%xmm4 \n"
- "pshufd $0xff,%%xmm5,%%xmm5 \n"
+ "movdqu (%3),%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm7,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm6 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm1 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm6 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "punpcklwd %%xmm1,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm6 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm6,0x10(%1) \n"
- "lea 0x20(%0),%0 \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm6,0x10(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5284,40 +5284,40 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "movd %2,%%xmm2 \n"
- "movd %3,%%xmm3 \n"
- "movd %4,%%xmm4 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshufd $0x44,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "pshufd $0x44,%%xmm3,%%xmm3 \n"
- "pshuflw $0x40,%%xmm4,%%xmm4 \n"
- "pshufd $0x44,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "pslld $0x18,%%xmm6 \n"
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "pmullw %%xmm3,%%xmm0 \n"
- "movdqu (%0),%%xmm7 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "pand %%xmm6,%%xmm7 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqu %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x4,%1 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -5335,27 +5335,27 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "movd %3,%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5372,28 +5372,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
int width) {
asm volatile(
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5411,26 +5411,26 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
int width) {
asm volatile(
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu (%1),%%ymm3 \n"
- "lea 0x20(%1),%1 \n"
- "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu (%1),%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5456,15 +5456,15 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5484,14 +5484,14 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpaddusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpaddusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5512,15 +5512,15 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psubusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -5540,14 +5540,14 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpsubusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsubusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -5569,40 +5569,40 @@ void SobelXRow_SSE2(const uint8_t* src_y0,
uint8_t* dst_sobelx,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "sub %0,%3 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x2(%0),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "movq 0x02(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x00(%0,%2,1),%%xmm2 \n"
- "movq 0x02(%0,%2,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%3,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "movq 0x02(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x00(%0,%2,1),%%xmm2 \n"
+ "movq 0x02(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -5623,39 +5623,39 @@ void SobelYRow_SSE2(const uint8_t* src_y0,
uint8_t* dst_sobely,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x1(%0),%%xmm1 \n"
- "movq 0x01(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x2(%0),%%xmm2 \n"
- "movq 0x02(%0,%1,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%2,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x1(%0),%%xmm1 \n"
+ "movq 0x01(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x2(%0),%%xmm2 \n"
+ "movq 0x02(%0,%1,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%2,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -5676,37 +5676,37 @@ void SobelRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm2 \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm1 \n"
- "punpckhwd %%xmm2,%%xmm2 \n"
- "por %%xmm5,%%xmm1 \n"
- "por %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklwd %%xmm0,%%xmm3 \n"
- "punpckhwd %%xmm0,%%xmm0 \n"
- "por %%xmm5,%%xmm3 \n"
- "por %%xmm5,%%xmm0 \n"
- "movdqu %%xmm1,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "movdqu %%xmm3,0x20(%2) \n"
- "movdqu %%xmm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "movdqu %%xmm3,0x20(%2) \n"
+ "movdqu %%xmm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -5723,21 +5723,21 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_y,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -5758,36 +5758,36 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "paddusb %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "punpckhbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "punpcklbw %%xmm2,%%xmm4 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "punpcklwd %%xmm0,%%xmm7 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm6,(%2) \n"
- "movdqu %%xmm4,0x10(%2) \n"
- "movdqu %%xmm7,0x20(%2) \n"
- "movdqu %%xmm1,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6,(%2) \n"
+ "movdqu %%xmm4,0x10(%2) \n"
+ "movdqu %%xmm7,0x20(%2) \n"
+ "movdqu %%xmm1,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -5806,67 +5806,67 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
const int32_t* previous_cumsum,
int width) {
asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "test $0xf,%1 \n"
- "jne 49f \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "punpckhwd %%xmm1,%%xmm3 \n"
- "punpckhbw %%xmm1,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "punpcklwd %%xmm1,%%xmm4 \n"
- "punpckhwd %%xmm1,%%xmm5 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "movdqu 0x10(%2),%%xmm3 \n"
- "paddd %%xmm0,%%xmm3 \n"
- "paddd %%xmm4,%%xmm0 \n"
- "movdqu 0x20(%2),%%xmm4 \n"
- "paddd %%xmm0,%%xmm4 \n"
- "paddd %%xmm5,%%xmm0 \n"
- "movdqu 0x30(%2),%%xmm5 \n"
- "lea 0x40(%2),%2 \n"
- "paddd %%xmm0,%%xmm5 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "movdqu %%xmm4,0x20(%1) \n"
- "movdqu %%xmm5,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu 0x10(%2),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu 0x20(%2),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu 0x30(%2),%%xmm5 \n"
+ "lea 0x40(%2),%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "movdqu %%xmm4,0x20(%1) \n"
+ "movdqu %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
// 1 pixel loop.
LABELALIGN
"10: \n"
- "movd (%0),%%xmm2 \n"
- "lea 0x4(%0),%0 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "lea 0x10(%2),%2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "lea 0x10(%2),%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(row), // %0
@@ -5886,119 +5886,119 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
uint8_t* dst,
int count) {
asm volatile(
- "movd %5,%%xmm5 \n"
- "cvtdq2ps %%xmm5,%%xmm5 \n"
- "rcpss %%xmm5,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "cmpl $0x80,%5 \n"
- "ja 40f \n"
-
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrld $0x10,%%xmm6 \n"
- "cvtdq2ps %%xmm6,%%xmm6 \n"
- "addps %%xmm6,%%xmm5 \n"
- "mulps %%xmm4,%%xmm5 \n"
- "cvtps2dq %%xmm5,%%xmm5 \n"
- "packssdw %%xmm5,%%xmm5 \n"
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
// 4 pixel small loop.
LABELALIGN
"4: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 4b \n"
- "jmp 49f \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
// 4 pixel loop
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm1,%%xmm1 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "mulps %%xmm4,%%xmm1 \n"
- "cvtdq2ps %%xmm2,%%xmm2 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "cvtps2dq %%xmm1,%%xmm1 \n"
- "cvtps2dq %%xmm2,%%xmm2 \n"
- "cvtps2dq %%xmm3,%%xmm3 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
// 1 pixel loop
LABELALIGN
"10: \n"
- "movdqu (%0),%%xmm0 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(topleft), // %0
"+r"(botleft), // %1
@@ -6021,70 +6021,70 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
intptr_t src_argb_stride_temp = src_argb_stride;
intptr_t temp;
asm volatile(
- "movq (%3),%%xmm2 \n"
- "movq 0x08(%3),%%xmm7 \n"
- "shl $0x10,%1 \n"
- "add $0x4,%1 \n"
- "movd %1,%%xmm5 \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- "pshufd $0x44,%%xmm7,%%xmm7 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm7,%%xmm0 \n"
- "movlhps %%xmm0,%%xmm2 \n"
- "movdqa %%xmm7,%%xmm4 \n"
- "addps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "addps %%xmm4,%%xmm3 \n"
- "addps %%xmm4,%%xmm4 \n"
+ "movq (%3),%%xmm2 \n"
+ "movq 0x08(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
// 4 pixel loop
LABELALIGN
"40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
- "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
- "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
- "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm1 \n"
- "addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1,(%2) \n"
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm0 \n"
- "addps %%xmm4,%%xmm3 \n"
- "movq %%xmm0,0x08(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%4 \n"
- "jl 19f \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
// 1 pixel loop
LABELALIGN
"10: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
- "addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%k1 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x04(%2),%2 \n"
- "sub $0x1,%4 \n"
- "jge 10b \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x04(%2),%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(src_argb), // %0
"+r"(src_argb_stride_temp), // %1
@@ -6106,68 +6106,68 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "sub %1,%0 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm4,%%xmm0 \n"
- "psubb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm5,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm3 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "pmaddubsw %%xmm1,%%xmm3 \n"
- "paddw %%xmm4,%%xmm2 \n"
- "paddw %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm2,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 100b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -6187,61 +6187,61 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "sub %1,%0 \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "vmovd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "vmovd %3,%%xmm5 \n"
- "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
- "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "sub %1,%0 \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
"vbroadcastss %%xmm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm4 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
"vbroadcastss %%xmm4,%%ymm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
- "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "rep movsb \n"
- "jmp 999f \n"
+ "rep movsb \n"
+ "jmp 999f \n"
"99: \n"
"vzeroupper \n"
@@ -6263,20 +6263,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
int width) {
asm volatile(
- "movdqu (%3),%%xmm5 \n"
+ "movdqu (%3),%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -6297,16 +6297,16 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -6324,24 +6324,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "add $0x10,%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%3) \n"
- "movdqu %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "add $0x10,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6360,24 +6360,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "add $0x10,%0 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,(%3) \n"
- "movdqu %%xmm2,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,(%3) \n"
+ "movdqu %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6396,26 +6396,26 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
"vextractf128 $0x0,%%ymm1,(%3) \n"
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -6435,26 +6435,26 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
- "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
+ "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
"vextractf128 $0x0,%%ymm1,(%3) \n"
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -6473,47 +6473,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
int width) {
asm volatile(
- "pxor %%xmm3,%%xmm3 \n"
+ "pxor %%xmm3,%%xmm3 \n"
// 2 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm3,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "mulps 0x10(%3),%%xmm0 \n"
- "mulps 0x10(%3),%%xmm4 \n"
- "addps (%3),%%xmm0 \n"
- "addps (%3),%%xmm4 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm6 \n"
- "mulps %%xmm1,%%xmm2 \n"
- "mulps %%xmm5,%%xmm6 \n"
- "mulps %%xmm2,%%xmm1 \n"
- "mulps %%xmm6,%%xmm5 \n"
- "mulps 0x20(%3),%%xmm2 \n"
- "mulps 0x20(%3),%%xmm6 \n"
- "mulps 0x30(%3),%%xmm1 \n"
- "mulps 0x30(%3),%%xmm5 \n"
- "addps %%xmm2,%%xmm0 \n"
- "addps %%xmm6,%%xmm4 \n"
- "addps %%xmm1,%%xmm0 \n"
- "addps %%xmm5,%%xmm4 \n"
- "cvttps2dq %%xmm0,%%xmm0 \n"
- "cvttps2dq %%xmm4,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps 0x10(%3),%%xmm0 \n"
+ "mulps 0x10(%3),%%xmm4 \n"
+ "addps (%3),%%xmm0 \n"
+ "addps (%3),%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps 0x20(%3),%%xmm2 \n"
+ "mulps 0x20(%3),%%xmm6 \n"
+ "mulps 0x30(%3),%%xmm1 \n"
+ "mulps 0x30(%3),%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -6609,27 +6609,27 @@ void HalfFloatRow_AVX2(const uint16_t* src,
int width) {
scale *= kScaleBias;
asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm2 \n" // 16 shorts
- "add $0x20,%0 \n"
- "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
- "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vpsrld $0xd,%%ymm3,%%ymm3 \n"
- "vpsrld $0xd,%%ymm2,%%ymm2 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
- "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -6650,8 +6650,8 @@ void HalfFloatRow_F16C(const uint16_t* src,
float scale,
int width) {
asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "sub %0,%1 \n"
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
@@ -6685,7 +6685,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
@@ -6719,21 +6719,21 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb,
// 1 pixel loop.
LABELALIGN
"1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "movzb -0x1(%0),%1 \n"
- "movzb 0x03(%3,%1,4),%1 \n"
- "mov %b1,-0x1(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "movzb -0x1(%0),%1 \n"
+ "movzb 0x03(%3,%1,4),%1 \n"
+ "mov %b1,-0x1(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"=&d"(pixel_temp), // %1
"+r"(width) // %2
@@ -6752,18 +6752,18 @@ void RGBColorTableRow_X86(uint8_t* dst_argb,
// 1 pixel loop.
LABELALIGN
"1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"=&d"(pixel_temp), // %1
"+r"(width) // %2
@@ -6782,86 +6782,86 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
uintptr_t pixel_temp;
uintptr_t table_temp;
asm volatile(
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0x8,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%2),%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm0 \n"
- "pand %%xmm4,%%xmm0 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb (%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,(%3) \n"
- "movzb 0x1(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x1(%3) \n"
- "movzb 0x2(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x2(%3) \n"
- "movzb 0x3(%2),%0 \n"
- "mov %b0,0x3(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x4(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x4(%3) \n"
- "movzb 0x5(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x5(%3) \n"
- "movzb 0x6(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x6(%3) \n"
- "movzb 0x7(%2),%0 \n"
- "mov %b0,0x7(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x8(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x8(%3) \n"
- "movzb 0x9(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x9(%3) \n"
- "movzb 0xa(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xa(%3) \n"
- "movzb 0xb(%2),%0 \n"
- "mov %b0,0xb(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
-
- "movzb 0xc(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xc(%3) \n"
- "movzb 0xd(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xd(%3) \n"
- "movzb 0xe(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xe(%3) \n"
- "movzb 0xf(%2),%0 \n"
- "mov %b0,0xf(%3) \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x10(%3),%3 \n"
- "sub $0x4,%4 \n"
- "jg 1b \n"
+ "movdqu (%2),%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb (%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,(%3) \n"
+ "movzb 0x1(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x1(%3) \n"
+ "movzb 0x2(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x2(%3) \n"
+ "movzb 0x3(%2),%0 \n"
+ "mov %b0,0x3(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x4(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x4(%3) \n"
+ "movzb 0x5(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x5(%3) \n"
+ "movzb 0x6(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x6(%3) \n"
+ "movzb 0x7(%2),%0 \n"
+ "mov %b0,0x7(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x8(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x8(%3) \n"
+ "movzb 0x9(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x9(%3) \n"
+ "movzb 0xa(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xa(%3) \n"
+ "movzb 0xb(%2),%0 \n"
+ "mov %b0,0xb(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb 0xc(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xc(%3) \n"
+ "movzb 0xd(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xd(%3) \n"
+ "movzb 0xe(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xe(%3) \n"
+ "movzb 0xf(%2),%0 \n"
+ "mov %b0,0xf(%3) \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
: "=&d"(pixel_temp), // %0
"=&a"(table_temp), // %1
"+r"(src_argb), // %2
@@ -6934,46 +6934,47 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
src_y_ptr = (uint8_t*)src_y;
asm volatile(
- "vmovdqu %5, %%ymm0 \n" // init blend value
- "vmovdqu %6, %%ymm1 \n" // init blend value
- "vmovdqu %7, %%ymm2 \n" // init blend value
- // "sub $0x20, %3 \n" //sub 32 from width for final loop
+ "vmovdqu %5, %%ymm0 \n" // init blend value
+ "vmovdqu %6, %%ymm1 \n" // init blend value
+ "vmovdqu %7, %%ymm2 \n" // init blend value
+ // "sub $0x20, %3 \n" //sub 32 from
+ // width for final loop
LABELALIGN
- "1: \n" // label 1
- "vmovdqu (%0,%4), %%ymm3 \n" // src_y
- "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
- "vmovdqu (%1), %%ymm5 \n" // src_uv
- "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
- "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
- // shuf
- "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
- // shuf
- "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
- "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
- // shuf
- "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
- "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
- "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
- "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
- "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
- "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
- "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
- "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
- "add $0x20, %4 \n" // add to src buffer
- // ptr
- "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
- "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
- "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
- "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
- "add $0x60,%2 \n" // add to dst buffer
- // ptr
- // "cmp %3, %4 \n" //(width64 -
+ "1: \n" // label 1
+ "vmovdqu (%0,%4), %%ymm3 \n" // src_y
+ "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
+ "vmovdqu (%1), %%ymm5 \n" // src_uv
+ "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
+ "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
+ // shuf
+ "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
+ // shuf
+ "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
+ "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
+ // shuf
+ "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
+ "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
+ "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
+ "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
+ "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
+ "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
+ "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
+ "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
+ "add $0x20, %4 \n" // add to src buffer
+ // ptr
+ "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
+ "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
+ "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
+ "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
+ "add $0x60,%2 \n" // add to dst buffer
+ // ptr
+ // "cmp %3, %4 \n" //(width64 -
// 32 bytes) and src_offset
- "sub $0x20,%3 \n" // 32 pixels per loop
- "jg 1b \n"
- "vzeroupper \n" // sse-avx2
-      // transitions
+ "sub $0x20,%3 \n" // 32 pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n" // sse-avx2
+      // transitions
: "+r"(src_y), //%0
"+r"(src_vu), //%1
@@ -7004,20 +7005,20 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
- "movdqu %3,%%xmm5 \n"
+ "movdqu %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
@@ -7034,16 +7035,16 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
@@ -7060,36 +7061,36 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
uint8_t* dst_uv,
int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n" // load 16 U values
- "movdqu (%1),%%xmm1 \n" // load 16 V values
- "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
- "movdqu 0(%1,%5,1),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n" // half size
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x10(%1),%1 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "psrlw $0x1,%%xmm0 \n"
- "psrlw $0x1,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
- "lea 0x10(%2),%2 \n"
- "sub $0x10,%3 \n" // 16 src pixels per loop
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n" // load 16 U values
+ "movdqu (%1),%%xmm1 \n" // load 16 V values
+ "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
+ "movdqu 0(%1,%5,1),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // half size
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n" // 16 src pixels per loop
+ "jg 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -7113,29 +7114,29 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u,
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n" // load 32 U values
- "vmovdqu (%1),%%ymm1 \n" // load 32 V values
- "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
- "vmovdqu 0(%1,%5,1),%%ymm3 \n"
- "lea 0x20(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x20(%1),%1 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
- "lea 0x20(%2),%2 \n"
- "sub $0x20,%3 \n" // 32 src pixels per loop
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n" // load 32 U values
+ "vmovdqu (%1),%%ymm1 \n" // load 32 V values
+ "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
+ "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%3 \n" // 32 src pixels per loop
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@@ -7148,17 +7149,17 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u,
void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
asm volatile(
- "pxor %%xmm1,%%xmm1 \n"
+ "pxor %%xmm1,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movd (%0),%%xmm0 \n" // load float
- "maxss %%xmm1, %%xmm0 \n" // clamp to zero
- "add 4, %0 \n"
- "movd %%xmm0, (%1) \n" // store float
- "add 4, %1 \n"
- "sub $0x4,%2 \n" // 1 float per loop
- "jg 1b \n"
+ "movd (%0),%%xmm0 \n" // load float
+ "maxss %%xmm1, %%xmm0 \n" // clamp to zero
+ "add 4, %0 \n"
+ "movd %%xmm0, (%1) \n" // store float
+ "add 4, %1 \n"
+ "sub $0x4,%2 \n" // 1 float per loop
+ "jg 1b \n"
: "+r"(src_x), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
diff --git a/chromium/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libyuv/source/row_neon.cc
index b81c53ff2bd..a5aeaabfbd7 100644
--- a/chromium/third_party/libyuv/source/row_neon.cc
+++ b/chromium/third_party/libyuv/source/row_neon.cc
@@ -114,11 +114,11 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV444 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -140,11 +140,11 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -168,10 +168,10 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %5, %5, #8 \n"
- "vld1.8 {d23}, [%3]! \n"
- "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
- "bgt 1b \n"
+ "subs %5, %5, #8 \n"
+ "vld1.8 {d23}, [%3]! \n"
+ "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -195,10 +195,10 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -221,9 +221,9 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -253,9 +253,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "subs %4, %4, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -287,10 +287,10 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB1555
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB1555
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -321,14 +321,14 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d4, #0x0f \n" // vbic bits to clear
+ "vmov.u8 d4, #0x0f \n" // vbic bits to clear
"1: \n"
READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB4444
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB4444
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -348,11 +348,11 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUV400 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -366,14 +366,14 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n"
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -387,11 +387,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
@@ -410,11 +410,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
@@ -439,9 +439,9 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
"1: \n"
READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb24), // %2
@@ -466,9 +466,9 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
"1: \n"
READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_rgb24), // %2
@@ -489,9 +489,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "subs %3, %3, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@@ -509,11 +509,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -530,11 +530,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -553,11 +553,11 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store U
- "vst1.8 {q1}, [%2]! \n" // store V
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -574,11 +574,11 @@ void MergeUVRow_NEON(const uint8_t* src_u,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load U
- "vld1.8 {q1}, [%1]! \n" // load V
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -596,13 +596,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
int width) {
asm volatile(
"1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
- "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store R
- "vst1.8 {q1}, [%2]! \n" // store G
- "vst1.8 {q2}, [%3]! \n" // store B
- "bgt 1b \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store R
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%3]! \n" // store B
+ "bgt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -621,13 +621,13 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load R
- "vld1.8 {q1}, [%1]! \n" // load G
- "vld1.8 {q2}, [%2]! \n" // load B
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
- "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q2}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
+ "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
+ "bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -642,10 +642,10 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
- "subs %2, %2, #32 \n" // 32 processed per loop
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
- "bgt 1b \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
@@ -657,11 +657,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile(
- "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n"
- "subs %1, %1, #16 \n" // 16 bytes per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
@@ -671,11 +671,11 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
// ARGBSetRow writes 'width' pixels using a 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile(
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
- "subs %1, %1, #4 \n" // 4 pixels per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
@@ -685,18 +685,18 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %2 \n"
- "sub %0, %0, #32 \n" // 32 bytes per loop
-
- "1: \n"
- "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32
- "subs %2, #32 \n" // 32 pixels per loop.
- "vrev64.8 q0, q2 \n"
- "vrev64.8 q1, q1 \n"
- "vswp d0, d1 \n"
- "vswp d2, d3 \n"
- "vst1.8 {q0, q1}, [%1]! \n" // dst += 32
- "bgt 1b \n"
+ "add %0, %0, %2 \n"
+ "sub %0, %0, #32 \n" // 32 bytes per loop
+
+ "1: \n"
+ "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32
+ "subs %2, #32 \n" // 32 pixels per loop.
+ "vrev64.8 q0, q2 \n"
+ "vrev64.8 q1, q1 \n"
+ "vswp d0, d1 \n"
+ "vswp d2, d3 \n"
+ "vst1.8 {q0, q1}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -707,16 +707,16 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
- "mov r12, #-16 \n"
- "add %0, %0, %2, lsl #1 \n"
- "sub %0, #16 \n"
+ "mov r12, #-16 \n"
+ "add %0, %0, %2, lsl #1 \n"
+ "sub %0, #16 \n"
"1: \n"
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %2, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst2.8 {d0, d1}, [%1]! \n" // dst += 16
- "bgt 1b \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst2.8 {d0, d1}, [%1]! \n" // dst += 16
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
@@ -730,17 +730,17 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
// Start at end of source row.
- "mov r12, #-16 \n"
- "add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
"1: \n"
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d0}, [%1]! \n" // dst += 8
- "vst1.8 {d1}, [%2]! \n"
- "bgt 1b \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -751,18 +751,18 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "add %0, %0, %2, lsl #2 \n"
- "sub %0, #32 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #32 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32
- "subs %2, #8 \n" // 8 pixels per loop.
- "vrev64.8 d0, d0 \n"
- "vrev64.8 d1, d1 \n"
- "vrev64.8 d2, d2 \n"
- "vrev64.8 d3, d3 \n"
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vrev64.8 d3, d3 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -776,13 +776,13 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
src_rgb24 += width * 3 - 24;
asm volatile(
"1: \n"
- "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
- "subs %2, #8 \n" // 8 pixels per loop.
- "vrev64.8 d0, d0 \n"
- "vrev64.8 d1, d1 \n"
- "vrev64.8 d2, d2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24
- "bgt 1b \n"
+ "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -794,12 +794,12 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
+ "vmov.u8 d4, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -810,13 +810,13 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
+ "vmov.u8 d4, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -827,13 +827,13 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
- "vmov.u8 d0, #255 \n" // Alpha
+ "vmov.u8 d0, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
"+r"(width) // %2
@@ -844,12 +844,12 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
// RGB24.
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -874,13 +874,13 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -920,13 +920,13 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -949,13 +949,13 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -969,11 +969,11 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
// RGB24.
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -985,11 +985,11 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
- "bgt 1b \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
@@ -1001,10 +1001,10 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1016,10 +1016,10 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1034,11 +1034,11 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1054,11 +1054,11 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1074,16 +1074,16 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // stride + src_yuy2
+ "add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
"+r"(dst_u), // %2
@@ -1101,16 +1101,16 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // stride + src_uyvy
+ "add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
"+r"(dst_u), // %2
@@ -1128,14 +1128,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
asm volatile(
- "vld1.8 {q2}, [%3] \n" // shuffler
+ "vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
- "subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
- "vst1.8 {q1}, [%1]! \n" // store 4.
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1151,12 +1151,12 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1173,12 +1173,12 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1193,11 +1193,11 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
@@ -1210,16 +1210,16 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
const uint32_t dither4,
int width) {
asm volatile(
- "vdup.32 d2, %2 \n" // dither4
+ "vdup.32 d2, %2 \n" // dither4
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d20, d20, d2 \n"
- "vqadd.u8 d21, d21, d2 \n"
- "vqadd.u8 d22, d22, d2 \n" // add for dither
+ "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d20, d20, d2 \n"
+ "vqadd.u8 d21, d21, d2 \n"
+ "vqadd.u8 d22, d22, d2 \n" // add for dither
ARGBTORGB565
- "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
+ "bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
@@ -1232,11 +1232,11 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
@@ -1248,14 +1248,14 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
- "vmov.u8 d4, #0x0f \n" // bits to clear with
+ "vmov.u8 d4, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
@@ -1265,20 +1265,20 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1291,11 +1291,11 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q3}, [%1]! \n" // store 16 A's.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
@@ -1306,18 +1306,18 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
- "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
- "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1327,18 +1327,18 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
- "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
- "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d1, d24 \n" // B
- "vmlal.u8 q2, d2, d25 \n" // G
- "vmlal.u8 q2, d3, d26 \n" // R
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d1, d24 \n" // B
+ "vmlal.u8 q2, d2, d25 \n" // G
+ "vmlal.u8 q2, d3, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1352,32 +1352,32 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "vmov.u8 d24, #112 \n" // UB / VR 0.875
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875
// coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
-
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1409,34 +1409,34 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
@@ -1455,34 +1455,34 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
@@ -1500,34 +1500,34 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q3, q2, q1)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_stride_bgra), // %1
"+r"(dst_u), // %2
@@ -1545,34 +1545,34 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_stride_abgr), // %1
"+r"(dst_u), // %2
@@ -1590,34 +1590,34 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_stride_rgba), // %1
"+r"(dst_u), // %2
@@ -1635,34 +1635,34 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_stride_rgb24), // %1
"+r"(dst_u), // %2
@@ -1680,34 +1680,34 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_stride_raw), // %1
"+r"(dst_u), // %2
@@ -1726,55 +1726,55 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_stride_rgb565), // %1
"+r"(dst_u), // %2
@@ -1792,55 +1792,55 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_stride_argb1555), // %1
"+r"(dst_u), // %2
@@ -1858,46 +1858,46 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q0, q4, #1 \n" // 2x average
- "vrshr.u16 q1, q5, #1 \n"
- "vrshr.u16 q2, q6, #1 \n"
+ "vrshr.u16 q0, q4, #1 \n" // 2x average
+ "vrshr.u16 q1, q5, #1 \n"
+ "vrshr.u16 q2, q6, #1 \n"
- "subs %4, %4, #16 \n" // 16 processed per loop.
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_stride_argb4444), // %1
"+r"(dst_u), // %2
@@ -1910,21 +1910,21 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1936,21 +1936,21 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile(
- "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1962,21 +1962,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile(
- "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1986,20 +1986,20 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2009,20 +2009,20 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2032,20 +2032,20 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
+ "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2055,20 +2055,20 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
+ "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2078,20 +2078,20 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
- "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2101,18 +2101,18 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
asm volatile(
- "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient
- "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
- "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient
+ "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q4, d0, d4 \n" // B
- "vmlal.u8 q4, d1, d5 \n" // G
- "vmlal.u8 q4, d2, d6 \n" // R
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q4, d0, d4 \n" // B
+ "vmlal.u8 q4, d1, d5 \n" // G
+ "vmlal.u8 q4, d2, d6 \n" // R
"vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_yj), // %1
"+r"(width) // %2
@@ -2122,18 +2122,18 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
asm volatile(
- "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient
- "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
- "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient
+ "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q4, d0, d4 \n" // B
- "vmlal.u8 q4, d1, d5 \n" // G
- "vmlal.u8 q4, d2, d6 \n" // R
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q4, d0, d4 \n" // B
+ "vmlal.u8 q4, d1, d5 \n" // G
+ "vmlal.u8 q4, d2, d6 \n" // R
"vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_yj), // %1
"+r"(width) // %2
@@ -2149,46 +2149,46 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -2206,51 +2206,51 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "subs %3, #8 \n"
- "blt 89f \n"
+ "subs %3, #8 \n"
+ "blt 89f \n"
// Blend 8 pixels.
"8: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
"89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
// Blend 1 pixels.
"1: \n"
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
- "subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
-
- "99: \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -2267,16 +2267,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
asm volatile(
// Attenuate 8 pixels.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2292,32 +2292,32 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
"vqdmulh.s16 q0, q0, q8 \n" // b * scale
"vqdmulh.s16 q1, q1, q8 \n" // g
"vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -2334,28 +2334,28 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
"vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
"vqrdmulh.s16 q11, q11, d0[1] \n" // g
"vqrdmulh.s16 q12, q12, d0[2] \n" // r
"vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2368,20 +2368,20 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
- "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
- "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2395,32 +2395,32 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
// r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
- "vmov.u8 d28, #24 \n" // BB coefficient
- "vmov.u8 d29, #98 \n" // BG coefficient
- "vmov.u8 d30, #50 \n" // BR coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
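
ARGBSepiaRow_NEON filters in place: with (b, g, r) the input channels, the new B is (17b + 68g + 35r) >> 7, the new G is (22b + 88g + 45r) >> 7 and the new R is (24b + 98g + 50r) >> 7, each saturated to 255. Note that the comments on d28..d30 still say "B? coefficient" although those constants feed the sepia R channel, matching the r formula quoted above the function. Scalar sketch (name illustrative):

#include <stdint.h>

// In-place scalar equivalent of the sepia loop above.
static void ARGBSepia_Scalar(uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    int sb = (17 * b + 68 * g + 35 * r) >> 7;  // vqshrn #7 on q2
    int sg = (22 * b + 88 * g + 45 * r) >> 7;  // vqshrn #7 on q3
    int sr = (24 * b + 98 * g + 50 * r) >> 7;  // vqshrn #7 on q8
    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
    dst_argb += 4;  // alpha (byte 3) is untouched
  }
}
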
@@ -2436,51 +2436,51 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
-
- "1: \n"
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q11, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2497,19 +2497,19 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2526,13 +2526,13 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2549,13 +2549,13 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2574,17 +2574,17 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
+ "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2601,12 +2601,12 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
asm volatile(
// 16 pixel loop.
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -2625,15 +2625,15 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
+ "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2653,23 +2653,23 @@ void SobelXRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0}, [%0],%5 \n" // top
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%2],%5 \n" // bottom
- "vld1.8 {d3}, [%2],%6 \n"
- "subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -2691,23 +2691,23 @@ void SobelYRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0}, [%0],%4 \n" // left
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%0],%5 \n" // right
- "vld1.8 {d3}, [%1],%5 \n"
- "subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -2729,18 +2729,18 @@ void HalfFloat1Row_NEON(const uint16_t* src,
asm volatile(
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2755,18 +2755,18 @@ void HalfFloatRow_NEON(const uint16_t* src,
asm volatile(
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2781,17 +2781,17 @@ void ByteToFloatRow_NEON(const uint8_t* src,
asm volatile(
"1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 bytes
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u8 q1, d2 \n" // 8 shorts
- "vmovl.u16 q2, d2 \n" // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // scale
- "vmul.f32 q3, q3, %y3 \n"
- "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
- "bgt 1b \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 bytes
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u8 q1, d2 \n" // 8 shorts
+ "vmovl.u16 q2, d2 \n" // 8 ints
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // scale
+ "vmul.f32 q3, q3, %y3 \n"
+ "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2808,26 +2808,26 @@ void GaussCol_NEON(const uint16_t* src0,
uint32_t* dst,
int width) {
asm volatile(
- "vmov.u16 d6, #4 \n" // constant 4
- "vmov.u16 d7, #6 \n" // constant 6
-
- "1: \n"
- "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
- "vld1.16 {q2}, [%4]! \n"
- "vaddl.u16 q0, d2, d4 \n" // * 1
- "vaddl.u16 q1, d3, d5 \n" // * 1
- "vld1.16 {q2}, [%1]! \n"
- "vmlal.u16 q0, d4, d6 \n" // * 4
- "vmlal.u16 q1, d5, d6 \n" // * 4
- "vld1.16 {q2}, [%2]! \n"
- "vmlal.u16 q0, d4, d7 \n" // * 6
- "vmlal.u16 q1, d5, d7 \n" // * 6
- "vld1.16 {q2}, [%3]! \n"
- "vmlal.u16 q0, d4, d6 \n" // * 4
- "vmlal.u16 q1, d5, d6 \n" // * 4
- "subs %6, %6, #8 \n" // 8 processed per loop
- "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
- "bgt 1b \n"
+ "vmov.u16 d6, #4 \n" // constant 4
+ "vmov.u16 d7, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
+ "vld1.16 {q2}, [%4]! \n"
+ "vaddl.u16 q0, d2, d4 \n" // * 1
+ "vaddl.u16 q1, d3, d5 \n" // * 1
+ "vld1.16 {q2}, [%1]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "vld1.16 {q2}, [%2]! \n"
+ "vmlal.u16 q0, d4, d7 \n" // * 6
+ "vmlal.u16 q1, d5, d7 \n" // * 6
+ "vld1.16 {q2}, [%3]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "subs %6, %6, #8 \n" // 8 processed per loop
+ "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
+ "bgt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2845,8 +2845,8 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile(
- "vmov.u32 q10, #4 \n" // constant 4
- "vmov.u32 q11, #6 \n" // constant 6
+ "vmov.u32 q10, #4 \n" // constant 4
+ "vmov.u32 q11, #6 \n" // constant 6
"1: \n"
"vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
@@ -2884,16 +2884,16 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q2}, [%0]! \n" // load 16 Y values
- "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
- "vmov d1, d0 \n"
- "vzip.u8 d0, d1 \n" // VV
- "vmov d3, d2 \n"
- "vzip.u8 d2, d3 \n" // UU
- "subs %3, %3, #16 \n" // 16 pixels per loop
- "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
- "vst3.8 {d1, d3, d5}, [%2]! \n"
- "bgt 1b \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 Y values
+ "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
+ "vmov d1, d0 \n"
+ "vzip.u8 d0, d1 \n" // VV
+ "vmov d3, d2 \n"
+ "vzip.u8 d2, d3 \n" // UU
+ "subs %3, %3, #16 \n" // 16 pixels per loop
+ "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
+ "vst3.8 {d1, d3, d5}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
@@ -2907,24 +2907,24 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
uint8_t* dst_uv,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
// pixels.
- "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
// pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
// pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
"vqrshrun.s16 d0, q1, #2 \n"
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
- "bgt 1b \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
+ "bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_stride_ayuv), // %1
"+r"(dst_uv), // %2
@@ -2938,24 +2938,24 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
uint8_t* dst_vu,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
// pixels.
- "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
// pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
// pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
"vqrshrun.s16 d1, q1, #2 \n"
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
- "bgt 1b \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
+ "bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_stride_ayuv), // %1
"+r"(dst_vu), // %2
@@ -2969,11 +2969,11 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
+ "bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2985,12 +2985,12 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
"1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
- "vld2.8 {d1, d3}, [%0]! \n"
- "vorr.u8 q2, q0, q0 \n" // move U after V
- "subs %2, %2, #16 \n" // 16 pixels per loop
- "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
- "bgt 1b \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
+ "vld2.8 {d1, d3}, [%0]! \n"
+ "vorr.u8 q2, q0, q0 \n" // move U after V
+ "subs %2, %2, #16 \n" // 16 pixels per loop
+ "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
@@ -3008,19 +3008,19 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v_1 = src_v + src_stride_v;
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 16 U values
- "vld1.8 {q1}, [%2]! \n" // load 16 V values
- "vld1.8 {q2}, [%1]! \n"
- "vld1.8 {q3}, [%3]! \n"
- "vpaddl.u8 q0, q0 \n" // half size
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q1, q3 \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 U values
+ "vld1.8 {q1}, [%2]! \n" // load 16 V values
+ "vld1.8 {q2}, [%1]! \n"
+ "vld1.8 {q3}, [%3]! \n"
+ "vpaddl.u8 q0, q0 \n" // half size
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q1, q3 \n"
"vqrshrn.u16 d0, q0, #2 \n"
"vqrshrn.u16 d1, q1, #2 \n"
- "subs %5, %5, #16 \n" // 16 src pixels per loop
- "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
- "bgt 1b \n"
+ "subs %5, %5, #16 \n" // 16 src pixels per loop
+ "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
+ "bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_u_1), // %1
"+r"(src_v), // %2
diff --git a/chromium/third_party/libyuv/source/row_neon64.cc b/chromium/third_party/libyuv/source/row_neon64.cc
index 06e9eea993c..d5258a3aef3 100644
--- a/chromium/third_party/libyuv/source/row_neon64.cc
+++ b/chromium/third_party/libyuv/source/row_neon64.cc
@@ -114,16 +114,16 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
+ "movi v23.8b, #255 \n" /* A */
+ "1: \n"
READYUV444
- "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%1, 448] \n"
- "prfm pldl1keep, [%2, 448] \n"
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -146,17 +146,17 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
-      "1:                                        \n"
+      "movi        v23.8b, #255                  \n" /* A */
+ "1: \n"
READYUV422
- "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%1, 128] \n"
- "prfm pldl1keep, [%2, 128] \n"
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -180,17 +180,17 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READYUV422
- "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "ld1 {v23.8b}, [%3], #8 \n"
- "prfm pldl1keep, [%1, 128] \n"
- "prfm pldl1keep, [%2, 128] \n"
- "prfm pldl1keep, [%3, 448] \n"
- "subs %w5, %w5, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v23.8b}, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "subs %w5, %w5, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -214,16 +214,16 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v20.8b, #255 \n" /* A */
- "1: \n"
+ "movi v20.8b, #255 \n" /* A */
+ "1: \n"
READYUV422
- "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v23, v22, v21)
- "prfm pldl1keep, [%1, 128] \n"
- "prfm pldl1keep, [%2, 128] \n"
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -246,15 +246,15 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READYUV422
- "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%1, 128] \n"
- "prfm pldl1keep, [%2, 128] \n"
- "subs %w4, %w4, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -286,16 +286,16 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w4, %w4, #8 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
ARGBTORGB565
- "prfm pldl1keep, [%1, 128] \n"
- "prfm pldl1keep, [%2, 128] \n"
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -326,17 +326,17 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w4, %w4, #8 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
ARGBTOARGB1555
- "prfm pldl1keep, [%1, 128] \n"
- "prfm pldl1keep, [%2, 128] \n"
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -369,18 +369,18 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v4.16b, #0x0f \n" // bits to clear with vbic.
- "1: \n"
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w4, %w4, #8 \n"
- "movi v23.8b, #255 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n"
ARGBTOARGB4444
- "prfm pldl1keep, [%1, 128] \n"
- "prfm pldl1keep, [%2, 128] \n"
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -401,14 +401,14 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READYUV400
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -423,15 +423,15 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v23.8b, #255 \n"
+ "movi v23.8b, #255 \n"
"1: \n"
- "ld1 {v20.8b}, [%0], #8 \n"
- "prfm pldl1keep, [%0, 448] \n"
- "orr v21.8b, v20.8b, v20.8b \n"
- "orr v22.8b, v20.8b, v20.8b \n"
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -446,15 +446,15 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READNV12
- "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%1, 256] \n"
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
@@ -475,15 +475,15 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READNV21
- "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%1, 256] \n"
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
@@ -504,14 +504,14 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READNV12
- "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%1, 256] \n"
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb24), // %2
@@ -532,14 +532,14 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile (
YUVTORGB_SETUP
- "1: \n"
+ "1: \n"
READNV21
- "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%1, 256] \n"
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_rgb24), // %2
@@ -560,12 +560,12 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP "1: \n" READNV12
- "prfm pldl1keep, [%0, 448] \n" YUVTORGB(
+ "prfm pldl1keep, [%0, 448] \n" YUVTORGB(
v22, v21, v20) ARGBTORGB565
- "prfm pldl1keep, [%1, 256] \n"
- "subs %w3, %w3, #8 \n"
- "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
- "b.gt 1b \n"
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@@ -584,14 +584,14 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READYUY2
- "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -610,14 +610,14 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile (
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
+ "movi v23.8b, #255 \n"
+ "1: \n"
READUYVY
YUVTORGB(v22, v21, v20)
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
- "b.gt 1b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -637,12 +637,12 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store U
- "st1 {v1.16b}, [%2], #16 \n" // store V
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -659,13 +659,13 @@ void MergeUVRow_NEON(const uint8_t* src_u,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load U
- "ld1 {v1.16b}, [%1], #16 \n" // load V
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -683,13 +683,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
int width) {
asm volatile(
"1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store R
- "st1 {v1.16b}, [%2], #16 \n" // store G
- "st1 {v2.16b}, [%3], #16 \n" // store B
- "b.gt 1b \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store R
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%3], #16 \n" // store B
+ "b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -708,16 +708,16 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load R
- "ld1 {v1.16b}, [%1], #16 \n" // load G
- "ld1 {v2.16b}, [%2], #16 \n" // load B
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "prfm pldl1keep, [%2, 448] \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
- "prfm pldl1keep, [%0, 448] \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%2], #16 \n" // load B
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -732,11 +732,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"1: \n"
- "ldp q0, q1, [%0], #32 \n"
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #32 \n" // 32 processed per loop
- "stp q0, q1, [%1], #32 \n"
- "b.gt 1b \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ "stp q0, q1, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
@@ -748,11 +748,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile(
- "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
- "subs %w1, %w1, #16 \n" // 16 bytes per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
@@ -761,11 +761,11 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile(
- "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
- "subs %w1, %w1, #4 \n" // 4 ints per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
@@ -779,17 +779,17 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "ld1 {v3.16b}, [%3] \n" // shuffler
- "add %0, %0, %w2, sxtw \n"
- "sub %0, %0, #32 \n"
- "1: \n"
- "ldr q2, [%0, 16] \n"
- "ldr q1, [%0], -32 \n" // src -= 32
- "subs %w2, %w2, #32 \n" // 32 pixels per loop.
- "tbl v0.16b, {v2.16b}, v3.16b \n"
- "tbl v1.16b, {v1.16b}, v3.16b \n"
- "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
- "b.gt 1b \n"
+ "ld1 {v3.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q2, [%0, 16] \n"
+ "ldr q1, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #32 \n" // 32 pixels per loop.
+ "tbl v0.16b, {v2.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -804,17 +804,17 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
- "ld1 {v4.16b}, [%3] \n" // shuffler
- "add %0, %0, %w2, sxtw #1 \n"
- "sub %0, %0, #32 \n"
- "1: \n"
- "ldr q1, [%0, 16] \n"
- "ldr q0, [%0], -32 \n" // src -= 32
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
- "tbl v2.16b, {v1.16b}, v4.16b \n"
- "tbl v3.16b, {v0.16b}, v4.16b \n"
- "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
- "b.gt 1b \n"
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
@@ -828,20 +828,20 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
// Start at end of source row.
- "ld1 {v4.16b}, [%4] \n" // shuffler
- "add %0, %0, %w3, sxtw #1 \n"
- "sub %0, %0, #32 \n"
- "1: \n"
- "ldr q1, [%0, 16] \n"
- "ldr q0, [%0], -32 \n" // src -= 32
- "subs %w3, %w3, #16 \n" // 16 pixels per loop.
- "tbl v2.16b, {v1.16b}, v4.16b \n"
- "tbl v3.16b, {v0.16b}, v4.16b \n"
- "uzp1 v0.16b, v2.16b, v3.16b \n" // U
- "uzp2 v1.16b, v2.16b, v3.16b \n" // V
- "st1 {v0.16b}, [%1], #16 \n" // dst += 16
- "st1 {v1.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v4.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "uzp1 v0.16b, v2.16b, v3.16b \n" // U
+ "uzp2 v1.16b, v2.16b, v3.16b \n" // V
+ "st1 {v0.16b}, [%1], #16 \n" // dst += 16
+ "st1 {v1.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -857,17 +857,17 @@ static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
// Start at end of source row.
- "ld1 {v4.16b}, [%3] \n" // shuffler
- "add %0, %0, %w2, sxtw #2 \n"
- "sub %0, %0, #32 \n"
- "1: \n"
- "ldr q1, [%0, 16] \n"
- "ldr q0, [%0], -32 \n" // src -= 32
- "subs %w2, %w2, #8 \n" // 8 pixels per loop.
- "tbl v2.16b, {v1.16b}, v4.16b \n"
- "tbl v3.16b, {v0.16b}, v4.16b \n"
- "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
- "b.gt 1b \n"
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -879,19 +879,19 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_rgb24,
int width) {
asm volatile(
- "ld1 {v3.16b}, [%4] \n" // shuffler
- "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
- "add %0, %0, %w2, sxtw \n"
- "sub %0, %0, #48 \n"
+ "ld1 {v3.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #48 \n"
"1: \n"
- "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3\n" // src -= 48
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
- "tbl v0.16b, {v0.16b}, v3.16b \n"
- "tbl v1.16b, {v1.16b}, v3.16b \n"
- "tbl v2.16b, {v2.16b}, v3.16b \n"
- "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
- "b.gt 1b \n"
+ "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v0.16b, {v0.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "tbl v2.16b, {v2.16b}, v3.16b \n"
+ "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -904,13 +904,14 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v4.8b, #255 \n" // Alpha
+ "movi v4.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
+ // RGB24.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -921,15 +922,15 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v5.8b, #255 \n" // Alpha
+ "movi v5.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -940,15 +941,15 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
- "movi v0.8b, #255 \n" // Alpha
+ "movi v0.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v2.8b, v4.8b, v4.8b \n" // move g
- "orr v1.8b, v5.8b, v5.8b \n" // move r
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
- "b.gt 1b \n"
+ "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v2.8b, v4.8b, v4.8b \n" // move g
+ "orr v1.8b, v5.8b, v5.8b \n" // move r
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
"+r"(width) // %2
@@ -960,13 +961,13 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -992,14 +993,14 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // Alpha
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1049,14 +1050,14 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // Alpha
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1084,12 +1085,12 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1103,11 +1104,12 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24
- "b.gt 1b \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
+ // RGB24
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -1119,13 +1121,13 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v4.8b, v2.8b, v2.8b \n" // mov g
- "orr v5.8b, v1.8b, v1.8b \n" // mov b
- "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
- "b.gt 1b \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
@@ -1137,11 +1139,11 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1153,11 +1155,11 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1172,12 +1174,12 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1193,12 +1195,12 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1216,15 +1218,15 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
- "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
- "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
@@ -1244,15 +1246,15 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
- "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
@@ -1270,14 +1272,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // shuffler
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #4 \n" // 4 processed per loop
- "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
- "st1 {v1.16b}, [%1], #16 \n" // store 4.
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1293,14 +1295,14 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "prfm pldl1keep, [%0, 448] \n"
- "orr v2.8b, v1.8b, v1.8b \n"
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v2.8b, v1.8b, v1.8b \n"
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1317,14 +1319,14 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
- "prfm pldl1keep, [%0, 448] \n"
- "orr v3.8b, v2.8b, v2.8b \n"
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v3.8b, v2.8b, v2.8b \n"
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1339,12 +1341,13 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
@@ -1357,16 +1360,17 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
const uint32_t dither4,
int width) {
asm volatile(
- "dup v1.4s, %w2 \n" // dither4
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v20.8b, v20.8b, v1.8b \n"
- "uqadd v21.8b, v21.8b, v1.8b \n"
- "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
- "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "dup v1.4s, %w2 \n" // dither4
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v20.8b, v20.8b, v1.8b \n"
+ "uqadd v21.8b, v21.8b, v1.8b \n"
+ "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
+ "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
@@ -1379,12 +1383,13 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
@@ -1396,15 +1401,16 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
- "movi v4.16b, #0x0f \n" // bits to clear with
+ "movi v4.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- "b.gt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
@@ -1414,21 +1420,21 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v6.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1441,11 +1447,11 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
@@ -1456,19 +1462,19 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #29 \n" // B * 0.1140 coefficient
- "movi v5.8b, #150 \n" // G * 0.5870 coefficient
- "movi v6.8b, #77 \n" // R * 0.2990 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1478,19 +1484,19 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #29 \n" // B * 0.1140 coefficient
- "movi v5.8b, #150 \n" // G * 0.5870 coefficient
- "movi v6.8b, #77 \n" // R * 0.2990 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v0.8h, v1.8b, v4.8b \n" // B
- "umlal v0.8h, v2.8b, v5.8b \n" // G
- "umlal v0.8h, v3.8b, v6.8b \n" // R
- "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
- "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v1.8b, v4.8b \n" // B
+ "umlal v0.8h, v2.8b, v5.8b \n" // G
+ "umlal v0.8h, v3.8b, v6.8b \n" // R
+ "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1504,33 +1510,33 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "movi v24.8b, #112 \n" // UB / VR 0.875
+ "movi v24.8b, #112 \n" // UB / VR 0.875
// coefficient
- "movi v25.8b, #74 \n" // UG -0.5781 coefficient
- "movi v26.8b, #38 \n" // UR -0.2969 coefficient
- "movi v27.8b, #18 \n" // VB -0.1406 coefficient
- "movi v28.8b, #94 \n" // VG -0.7344 coefficient
- "movi v29.16b,#0x80 \n" // 128.5
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlsl v4.8h, v1.8b, v25.8b \n" // G
- "umlsl v4.8h, v2.8b, v26.8b \n" // R
- "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
-
- "umull v3.8h, v2.8b, v24.8b \n" // R
- "umlsl v3.8h, v1.8b, v28.8b \n" // G
- "umlsl v3.8h, v0.8b, v27.8b \n" // B
- "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
-
- "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
-
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
+
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
+
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1574,28 +1580,28 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
-
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
@@ -1614,33 +1620,33 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int width) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
- "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
- "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
- "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
- "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
- "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
- "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+      "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
@@ -1660,27 +1666,27 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v3.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+      "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
@@ -1700,27 +1706,27 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v3.8h, #1 \n" // 2x average
- "urshr v2.8h, v2.8h, #1 \n"
- "urshr v1.8h, v1.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
+
+      "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
@@ -1740,27 +1746,27 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+      "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
@@ -1780,27 +1786,27 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+      "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2
@@ -1820,27 +1826,27 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
const uint8_t* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v2.8h, v2.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v0.8h, v0.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+      "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+      "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more RAW pixels
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
+
+      "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_raw_1), // %1
"+r"(dst_u), // %2
@@ -1862,43 +1868,43 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
RGB565TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
- "prfm pldl1keep, [%1, 448] \n"
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
RGB565TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
- "urshr v0.8h, v16.8h, #1 \n" // 2x average
- "urshr v1.8h, v17.8h, #1 \n"
- "urshr v2.8h, v18.8h, #1 \n"
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_rgb565_1), // %1
"+r"(dst_u), // %2
@@ -1920,43 +1926,43 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
RGB555TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
- "prfm pldl1keep, [%1, 448] \n"
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
RGB555TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
- "urshr v0.8h, v16.8h, #1 \n" // 2x average
- "urshr v1.8h, v17.8h, #1 \n"
- "urshr v2.8h, v18.8h, #1 \n"
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2
@@ -1978,43 +1984,43 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
asm volatile(
RGBTOUV_SETUP_REG // sets v20-v25
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
ARGB4444TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
- "prfm pldl1keep, [%1, 448] \n"
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%1, 448] \n"
ARGB4444TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
- "urshr v0.8h, v16.8h, #1 \n" // 2x average
- "urshr v1.8h, v17.8h, #1 \n"
- "urshr v2.8h, v18.8h, #1 \n"
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_argb4444_1), // %1
"+r"(dst_u), // %2
@@ -2030,22 +2036,22 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile(
- "movi v24.8b, #25 \n" // B * 0.1016 coefficient
- "movi v25.8b, #129 \n" // G * 0.5078 coefficient
- "movi v26.8b, #66 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2058,22 +2064,22 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile(
- "movi v4.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v6.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2085,22 +2091,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile(
- "movi v24.8b, #25 \n" // B * 0.1016 coefficient
- "movi v25.8b, #129 \n" // G * 0.5078 coefficient
- "movi v26.8b, #66 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2110,21 +2116,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #66 \n" // R * 0.2578 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v6.8b, #25 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // R
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // B
- "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2134,21 +2140,21 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "movi v6.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v4.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // R
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // B
- "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2158,21 +2164,21 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v6.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // B
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2182,21 +2188,21 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
asm volatile(
- "movi v4.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v6.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2206,21 +2212,21 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
asm volatile(
- "movi v6.8b, #25 \n" // B * 0.1016 coefficient
- "movi v5.8b, #129 \n" // G * 0.5078 coefficient
- "movi v4.8b, #66 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+      "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+      "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2230,19 +2236,19 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
asm volatile(
- "movi v4.8b, #29 \n" // B * 0.1140 coefficient
- "movi v5.8b, #150 \n" // G * 0.5870 coefficient
- "movi v6.8b, #77 \n" // R * 0.2990 coefficient
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // B
- "umlal v0.8h, v1.8b, v5.8b \n" // G
- "umlal v0.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // B
+ "umlal v0.8h, v1.8b, v5.8b \n" // G
+ "umlal v0.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_yj), // %1
"+r"(width) // %2
@@ -2252,19 +2258,19 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
asm volatile(
- "movi v6.8b, #29 \n" // B * 0.1140 coefficient
- "movi v5.8b, #150 \n" // G * 0.5870 coefficient
- "movi v4.8b, #77 \n" // R * 0.2990 coefficient
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // B
- "umlal v0.8h, v1.8b, v5.8b \n" // G
- "umlal v0.8h, v2.8b, v6.8b \n" // R
- "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "movi v6.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v4.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+      "umull v0.8h, v0.8b, v4.8b \n" // R
+ "umlal v0.8h, v1.8b, v5.8b \n" // G
+      "umlal v0.8h, v2.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_yj), // %1
"+r"(width) // %2
@@ -2282,49 +2288,49 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
- "dup v5.16b, %w4 \n"
- "dup v4.16b, %w5 \n"
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
// General purpose row blend.
"1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "prfm pldl1keep, [%1, 448] \n"
- "prfm pldl1keep, [%2, 448] \n"
- "subs %w3, %w3, #16 \n"
- "umull v2.8h, v0.8b, v4.8b \n"
- "umull2 v3.8h, v0.16b, v4.16b \n"
- "umlal v2.8h, v1.8b, v5.8b \n"
- "umlal2 v3.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v2.8h, #8 \n"
- "rshrn2 v0.16b, v3.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "prfm pldl1keep, [%1, 448] \n"
- "prfm pldl1keep, [%2, 448] \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -2343,58 +2349,60 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
uint8_t* dst_argb,
int width) {
asm volatile(
- "subs %w3, %w3, #8 \n"
- "b.lt 89f \n"
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
// Blend 8 pixels.
"8: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- // pixels
- "b.ge 8b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.ge 8b \n"
"89: \n"
- "adds %w3, %w3, #8-1 \n"
- "b.lt 99f \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
// Blend 1 pixels.
"1: \n"
- "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
- "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #1 \n" // 1 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
- "b.ge 1b \n"
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel
+ // ARGB0.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
+ // ARGB1.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
"99: \n"
@@ -2414,17 +2422,17 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
asm volatile(
// Attenuate 8 pixels.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v3.8b \n" // b * a
- "umull v5.8h, v1.8b, v3.8b \n" // g * a
- "umull v6.8h, v2.8b, v3.8b \n" // r * a
- "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
- "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
- "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2440,33 +2448,33 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "dup v4.8h, %w2 \n"
- "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
- "dup v5.8h, %w3 \n" // interval multiply.
- "dup v6.8h, %w4 \n" // interval add
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
- "uxtl v1.8h, v1.8b \n"
- "uxtl v2.8h, v2.8b \n"
- "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
- "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
- "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
- "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
- "mul v1.8h, v1.8h, v5.8h \n" // g
- "mul v2.8h, v2.8h, v5.8h \n" // r
- "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
- "add v1.8h, v1.8h, v6.8h \n" // g
- "add v2.8h, v2.8h, v6.8h \n" // r
- "uqxtn v0.8b, v0.8h \n"
- "uqxtn v1.8b, v1.8h \n"
- "uqxtn v2.8b, v2.8h \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -2483,29 +2491,29 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "dup v0.4s, %w3 \n" // duplicate scale value.
- "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
- "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
- "uxtl v5.8h, v5.8b \n"
- "uxtl v6.8h, v6.8b \n"
- "uxtl v7.8h, v7.8b \n"
- "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
- "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
- "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
- "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
- "uqxtn v4.8b, v4.8h \n"
- "uqxtn v5.8b, v5.8h \n"
- "uqxtn v6.8b, v6.8h \n"
- "uqxtn v7.8b, v7.8h \n"
- "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2518,21 +2526,21 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v24.8b, #29 \n" // B * 0.1140 coefficient
- "movi v25.8b, #150 \n" // G * 0.5870 coefficient
- "movi v26.8b, #77 \n" // R * 0.2990 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlal v4.8h, v1.8b, v25.8b \n" // G
- "umlal v4.8h, v2.8b, v26.8b \n" // R
- "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
- "orr v1.8b, v0.8b, v0.8b \n" // G
- "orr v2.8b, v0.8b, v0.8b \n" // R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
+ "movi v24.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v25.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v26.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2547,33 +2555,33 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile(
- "movi v20.8b, #17 \n" // BB coefficient
- "movi v21.8b, #68 \n" // BG coefficient
- "movi v22.8b, #35 \n" // BR coefficient
- "movi v24.8b, #22 \n" // GB coefficient
- "movi v25.8b, #88 \n" // GG coefficient
- "movi v26.8b, #45 \n" // GR coefficient
- "movi v28.8b, #24 \n" // BB coefficient
- "movi v29.8b, #98 \n" // BG coefficient
- "movi v30.8b, #50 \n" // BR coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
- "umlal v4.8h, v1.8b, v21.8b \n" // G
- "umlal v4.8h, v2.8b, v22.8b \n" // R
- "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
- "umlal v5.8h, v1.8b, v25.8b \n" // G
- "umlal v5.8h, v2.8b, v26.8b \n" // R
- "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
- "umlal v6.8h, v1.8b, v29.8b \n" // G
- "umlal v6.8h, v2.8b, v30.8b \n" // R
- "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
- "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
- "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // BB coefficient
+ "movi v29.8b, #98 \n" // BG coefficient
+ "movi v30.8b, #50 \n" // BR coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
@@ -2589,52 +2597,52 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
- "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
- "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
-
- "1: \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
- "uxtl v17.8h, v17.8b \n" // g
- "uxtl v18.8h, v18.8b \n" // r
- "uxtl v19.8h, v19.8b \n" // a
- "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
- "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
- "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
- "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
- "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
- "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
- "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
- "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
- "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
- "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
- "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
- "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
- "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
- "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
- "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
- "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
- "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
- "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2652,21 +2660,21 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // multiply B
- "umull v1.8h, v1.8b, v5.8b \n" // multiply G
- "umull v2.8h, v2.8b, v6.8b \n" // multiply R
- "umull v3.8h, v3.8b, v7.8b \n" // multiply A
- "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
- "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
- "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
- "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2683,17 +2691,17 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v4.8b \n"
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "uqadd v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2710,17 +2718,17 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqsub v0.8b, v0.8b, v4.8b \n"
- "uqsub v1.8b, v1.8b, v5.8b \n"
- "uqsub v2.8b, v2.8b, v6.8b \n"
- "uqsub v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@@ -2739,19 +2747,19 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v1.8b \n" // add
- "orr v1.8b, v0.8b, v0.8b \n"
- "orr v2.8b, v0.8b, v0.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2768,14 +2776,14 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
asm volatile(
// 16 pixel loop.
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
- "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "uqadd v0.16b, v0.16b, v1.16b \n" // add
- "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -2794,17 +2802,17 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v1.8b, v0.8b, v2.8b \n" // add
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2824,26 +2832,26 @@ void SobelXRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.8b}, [%0],%5 \n" // top
- "ld1 {v1.8b}, [%0],%6 \n"
- "prfm pldl1keep, [%0, 448] \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%1],%5 \n" // center * 2
- "ld1 {v3.8b}, [%1],%6 \n"
- "prfm pldl1keep, [%1, 448] \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%2],%5 \n" // bottom
- "ld1 {v3.8b}, [%2],%6 \n"
- "prfm pldl1keep, [%2, 448] \n"
- "subs %w4, %w4, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -2865,25 +2873,25 @@ void SobelYRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.8b}, [%0],%4 \n" // left
- "ld1 {v1.8b}, [%1],%4 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%0],%4 \n" // center * 2
- "ld1 {v3.8b}, [%1],%4 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%0],%5 \n" // right
- "ld1 {v3.8b}, [%1],%5 \n"
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -2901,17 +2909,17 @@ void HalfFloat1Row_NEON(const uint16_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fcvtn v1.4h, v2.4s \n" // 8 half floats
- "fcvtn2 v1.8h, v3.4s \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2925,19 +2933,19 @@ void HalfFloatRow_NEON(const uint16_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
- "uqshrn2 v1.8h, v3.4s, #13 \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2951,18 +2959,18 @@ void ByteToFloatRow_NEON(const uint8_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v1.8h, v1.8b \n" // 8 shorts
- "uxtl v2.4s, v1.4h \n" // 8 ints
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
- "b.gt 1b \n"
+ "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v1.8h, v1.8b \n" // 8 shorts
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2976,21 +2984,21 @@ float ScaleMaxSamples_NEON(const float* src,
int width) {
float fmax;
asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n"
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
- "fmax v5.4s, v5.4s, v1.4s \n" // max
- "fmax v6.4s, v6.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "fmax v5.4s, v5.4s, v6.4s \n" // max
- "fmaxv %s3, v5.4s \n" // signed max acculator
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n"
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
+ "fmax v5.4s, v5.4s, v1.4s \n" // max
+ "fmax v6.4s, v6.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "fmax v5.4s, v5.4s, v6.4s \n" // max
+ "fmaxv %s3, v5.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
@@ -3006,22 +3014,22 @@ float ScaleSumSamples_NEON(const float* src,
int width) {
float fsum;
asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n" // max
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n"
- "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
- "fmla v6.4s, v2.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "faddp v5.4s, v5.4s, v6.4s \n"
- "faddp v5.4s, v5.4s, v5.4s \n"
- "faddp %3.4s, v5.4s, v5.4s \n" // sum
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n" // max
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "faddp v5.4s, v5.4s, v6.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "faddp %3.4s, v5.4s, v5.4s \n" // sum
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
@@ -3034,13 +3042,13 @@ float ScaleSumSamples_NEON(const float* src,
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3057,31 +3065,31 @@ void GaussCol_NEON(const uint16_t* src0,
uint32_t* dst,
int width) {
asm volatile(
- "movi v6.8h, #4 \n" // constant 4
- "movi v7.8h, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
- "ld1 {v2.8h}, [%4], #16 \n"
- "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
- "prfm pldl1keep, [%0, 448] \n"
- "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
- "ld1 {v2.8h}, [%1], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "prfm pldl1keep, [%1, 448] \n"
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "ld1 {v2.8h}, [%2], #16 \n"
- "umlal v0.4s, v2.4h, v7.4h \n" // * 6
- "prfm pldl1keep, [%2, 448] \n"
- "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
- "ld1 {v2.8h}, [%3], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "prfm pldl1keep, [%3, 448] \n"
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "subs %w6, %w6, #8 \n" // 8 processed per loop
- "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
- "prfm pldl1keep, [%4, 448] \n"
- "b.gt 1b \n"
+ "movi v6.8h, #4 \n" // constant 4
+ "movi v7.8h, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
+ "ld1 {v2.8h}, [%4], #16 \n"
+ "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
+ "ld1 {v2.8h}, [%1], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%1, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "ld1 {v2.8h}, [%2], #16 \n"
+ "umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
+ "ld1 {v2.8h}, [%3], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%3, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%4, 448] \n"
+ "b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -3099,28 +3107,28 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile(
- "movi v6.4s, #4 \n" // constant 4
- "movi v7.4s, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
- "add v0.4s, v0.4s, v1.4s \n" // * 1
- "add v1.4s, v1.4s, v2.4s \n" // * 1
- "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
- "mla v0.4s, v2.4s, v7.4s \n" // * 6
- "mla v1.4s, v3.4s, v7.4s \n" // * 6
- "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
- "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
- "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
- "add v3.4s, v3.4s, v5.4s \n"
- "prfm pldl1keep, [%0, 448] \n"
- "mla v0.4s, v2.4s, v6.4s \n" // * 4
- "mla v1.4s, v3.4s, v6.4s \n" // * 4
- "subs %w5, %w5, #8 \n" // 8 processed per loop
- "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
- "uqrshrn2 v0.8h, v1.4s, #8 \n"
- "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
- "b.gt 1b \n"
+ "movi v6.4s, #4 \n" // constant 4
+ "movi v7.4s, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
+ "add v0.4s, v0.4s, v1.4s \n" // * 1
+ "add v1.4s, v1.4s, v2.4s \n" // * 1
+ "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
+ "mla v0.4s, v2.4s, v7.4s \n" // * 6
+ "mla v1.4s, v3.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
+ "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
+ "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
+ "add v3.4s, v3.4s, v5.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mla v0.4s, v2.4s, v6.4s \n" // * 4
+ "mla v1.4s, v3.4s, v6.4s \n" // * 4
+ "subs %w5, %w5, #8 \n" // 8 processed per loop
+ "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
+ "uqrshrn2 v0.8h, v1.4s, #8 \n"
+ "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -3142,30 +3150,30 @@ void GaussCol_F32_NEON(const float* src0,
float* dst,
int width) {
asm volatile(
- "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
-
- "1: \n"
- "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
- "ld1 {v2.4s, v3.4s}, [%1], #32 \n"
- "fmla v0.4s, v2.4s, v6.4s \n" // * 4
- "ld1 {v4.4s, v5.4s}, [%2], #32 \n"
- "fmla v1.4s, v3.4s, v6.4s \n"
- "prfm pldl1keep, [%0, 448] \n"
- "fmla v0.4s, v4.4s, v7.4s \n" // * 6
- "ld1 {v2.4s, v3.4s}, [%3], #32 \n"
- "fmla v1.4s, v5.4s, v7.4s \n"
- "prfm pldl1keep, [%1, 448] \n"
- "fmla v0.4s, v2.4s, v6.4s \n" // * 4
- "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
- "fmla v1.4s, v3.4s, v6.4s \n"
- "prfm pldl1keep, [%2, 448] \n"
- "fadd v0.4s, v0.4s, v4.4s \n" // * 1
- "prfm pldl1keep, [%3, 448] \n"
- "fadd v1.4s, v1.4s, v5.4s \n"
- "prfm pldl1keep, [%4, 448] \n"
- "subs %w6, %w6, #8 \n" // 8 processed per loop
- "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
+ "ld1 {v2.4s, v3.4s}, [%1], #32 \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%2], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%3], #32 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "fadd v0.4s, v0.4s, v4.4s \n" // * 1
+ "prfm pldl1keep, [%3, 448] \n"
+ "fadd v1.4s, v1.4s, v5.4s \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -3180,27 +3188,28 @@ void GaussCol_F32_NEON(const float* src0,
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussRow_F32_NEON(const float* src, float* dst, int width) {
asm volatile(
- "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
-
- "1: \n"
- "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4\n" // load 12 samples, 5 rows
- "fadd v0.4s, v0.4s, v1.4s \n" // * 1
- "ld1 {v4.4s, v5.4s}, [%0], %5 \n"
- "fadd v1.4s, v1.4s, v2.4s \n"
- "fmla v0.4s, v4.4s, v7.4s \n" // * 6
- "ld1 {v2.4s, v3.4s}, [%0], %4 \n"
- "fmla v1.4s, v5.4s, v7.4s \n"
- "ld1 {v4.4s, v5.4s}, [%0], %6 \n"
- "fadd v2.4s, v2.4s, v4.4s \n"
- "fadd v3.4s, v3.4s, v5.4s \n"
- "fmla v0.4s, v2.4s, v6.4s \n" // * 4
- "fmla v1.4s, v3.4s, v6.4s \n"
- "prfm pldl1keep, [%0, 448] \n"
- "fmul v0.4s, v0.4s, v8.4s \n" // / 256
- "fmul v1.4s, v1.4s, v8.4s \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5
+ // rows
+ "fadd v0.4s, v0.4s, v1.4s \n" // * 1
+ "ld1 {v4.4s, v5.4s}, [%0], %5 \n"
+ "fadd v1.4s, v1.4s, v2.4s \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%0], %4 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "ld1 {v4.4s, v5.4s}, [%0], %6 \n"
+ "fadd v2.4s, v2.4s, v4.4s \n"
+ "fadd v3.4s, v3.4s, v5.4s \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v0.4s, v0.4s, v8.4s \n" // / 256
+ "fmul v1.4s, v1.4s, v8.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3218,15 +3227,15 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
- "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
- "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
- "subs %w3, %w3, #16 \n" // 16 pixels per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
- "b.gt 1b \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
+ "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
+ "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
@@ -3243,19 +3252,19 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
- "prfm pldl1keep, [%0, 448] \n"
- "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
- "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
- "uqrshrn v2.8b, v1.8h, #2 \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v2.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
"+r"(dst_uv), // %2
@@ -3272,19 +3281,19 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
- "prfm pldl1keep, [%0, 448] \n"
- "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
- "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
- "uqrshrn v1.8b, v1.8h, #2 \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
"+r"(dst_vu), // %2
@@ -3297,11 +3306,11 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3316,16 +3325,16 @@ static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // shuffler
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
- "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
- "ld1 {v1.16b}, [%0], 16 \n"
- "prfm pldl1keep, [%0, 448] \n"
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "tbl v0.16b, {v0.16b}, v2.16b \n"
- "tbl v1.16b, {v1.16b}, v2.16b \n"
- "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
+ "ld1 {v1.16b}, [%0], 16 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "tbl v0.16b, {v0.16b}, v2.16b \n"
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
+ "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
@@ -3343,23 +3352,23 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v_1 = src_v + src_stride_v;
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
- "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
- "ld1 {v2.16b}, [%1], #16 \n"
- "ld1 {v3.16b}, [%3], #16 \n"
- "uaddlp v0.8h, v0.16b \n" // half size
- "prfm pldl1keep, [%0, 448] \n"
- "uaddlp v1.8h, v1.16b \n"
- "prfm pldl1keep, [%2, 448] \n"
- "uadalp v0.8h, v2.16b \n"
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v1.8h, v3.16b \n"
- "prfm pldl1keep, [%3, 448] \n"
- "uqrshrn v0.8b, v0.8h, #2 \n"
- "uqrshrn v1.8b, v1.8h, #2 \n"
- "subs %w5, %w5, #16 \n" // 16 src pixels per loop
- "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
+ "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
+ "ld1 {v2.16b}, [%1], #16 \n"
+ "ld1 {v3.16b}, [%3], #16 \n"
+ "uaddlp v0.8h, v0.16b \n" // half size
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v3.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n"
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w5, %w5, #16 \n" // 16 src pixels per loop
+ "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
+ "b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_u_1), // %1
"+r"(src_v), // %2
diff --git a/chromium/third_party/libyuv/source/scale_any.cc b/chromium/third_party/libyuv/source/scale_any.cc
index b571aec964f..c93d70c5fc7 100644
--- a/chromium/third_party/libyuv/source/scale_any.cc
+++ b/chromium/third_party/libyuv/source/scale_any.cc
@@ -490,6 +490,13 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
4,
1)
#endif
+#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
+SDAANY(ScaleUVRowDownEven_Any_NEON,
+ ScaleUVRowDownEven_NEON,
+ ScaleUVRowDownEven_C,
+ 2,
+ 3)
+#endif
#ifdef SASIMDONLY
// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
diff --git a/chromium/third_party/libyuv/source/scale_common.cc b/chromium/third_party/libyuv/source/scale_common.cc
index fd4cbd03867..81959925c8a 100644
--- a/chromium/third_party/libyuv/source/scale_common.cc
+++ b/chromium/third_party/libyuv/source/scale_common.cc
@@ -1412,8 +1412,8 @@ enum FilterMode ScaleFilterReduce(int src_width,
src_height = -src_height;
}
if (filtering == kFilterBox) {
- // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
- if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+ // If scaling either axis to 0.5 or larger, switch from Box to Bilinear.
+ if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) {
filtering = kFilterBilinear;
}
}
diff --git a/chromium/third_party/libyuv/source/scale_gcc.cc b/chromium/third_party/libyuv/source/scale_gcc.cc
index ef59c7e95ef..e575ee18bcb 100644
--- a/chromium/third_party/libyuv/source/scale_gcc.cc
+++ b/chromium/third_party/libyuv/source/scale_gcc.cc
@@ -102,16 +102,16 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -125,25 +125,25 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -156,33 +156,33 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "psrlw $0x1,%%xmm0 \n"
- "psrlw $0x1,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -200,17 +200,17 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -225,26 +225,26 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -258,34 +258,34 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -301,24 +301,24 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- "pslld $0x10,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -332,46 +332,46 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
int dst_width) {
intptr_t stridex3;
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "psllw $0x3,%%xmm5 \n"
- "lea 0x00(%4,%4,2),%3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ "lea 0x00(%4,%4,2),%3 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%4,2),%%xmm2 \n"
- "movdqu 0x10(%0,%4,2),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%4,2),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,2),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -387,26 +387,26 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrld $0x18,%%ymm5,%%ymm5 \n"
- "vpslld $0x10,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -420,46 +420,46 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpsllw $0x3,%%ymm4,%%ymm5 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -476,9 +476,9 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
:
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
@@ -488,20 +488,20 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm2 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -514,18 +514,18 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
@@ -535,37 +535,37 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -580,18 +580,18 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
@@ -602,40 +602,40 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -651,23 +651,23 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1,0x8(%1) \n"
- "lea 0xc(%1),%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -681,10 +681,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
:
: "m"(kShufAb0), // %0
"m"(kShufAb1), // %1
@@ -695,25 +695,25 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,(%1) \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -726,10 +726,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
:
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
@@ -739,44 +739,44 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm6 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqu 0x00(%0,%3,2),%%xmm6 \n"
- "lea 0x10(%0),%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movd %%xmm6,(%1) \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -791,25 +791,25 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
int src_width) {
asm volatile(
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n" // src_ptr += 16
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x10(%1),%%xmm1 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n" // src_ptr += 16
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x10(%1),%%xmm1 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -824,22 +824,22 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
int src_width) {
asm volatile(
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm3 \n"
- "lea 0x20(%0),%0 \n" // src_ptr += 32
- "vpermq $0xd8,%%ymm3,%%ymm3 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpaddusw (%1),%%ymm2,%%ymm0 \n"
- "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpaddusw (%1),%%ymm2,%%ymm0 \n"
+ "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -866,69 +866,69 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile(
- "movd %6,%%xmm2 \n"
- "movd %7,%%xmm3 \n"
- "movl $0x04040000,%k2 \n"
- "movd %k2,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n" // 0x007f007f
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $15,%%xmm7 \n" // 0x00010001
-
- "pextrw $0x1,%%xmm2,%k3 \n"
- "subl $0x2,%5 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
+
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movzwl 0x00(%1,%4,1),%k2 \n"
- "movd %k2,%%xmm4 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm0 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movzwl 0x00(%1,%4,1),%k2 \n"
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
// 1
- "paddusb %%xmm7,%%xmm1 \n"
- "pmaddubsw %%xmm0,%%xmm1 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "paddw %9,%%xmm1 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,%k2 \n"
- "mov %w2,(%0) \n"
- "lea 0x2(%0),%0 \n"
- "subl $0x2,%5 \n"
- "jge 2b \n"
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2,(%0) \n"
+ "lea 0x2(%0),%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
LABELALIGN
"29: \n"
- "addl $0x1,%5 \n"
- "jl 99f \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm2 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm2 \n"
- "paddusb %%xmm7,%%xmm2 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "paddw %9,%%xmm2 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movd %%xmm2,%k2 \n"
- "mov %b2,(%0) \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2,(%0) \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -966,16 +966,16 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -993,14 +993,14 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "shufps $0xdd,%%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1017,17 +1017,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1043,21 +1043,21 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -1076,23 +1076,23 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
LABELALIGN
"1: \n"
- "movd (%0),%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%0,%1,2),%%xmm2 \n"
- "movd 0x00(%0,%4,1),%%xmm3 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movd (%0),%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%0,%1,2),%%xmm2 \n"
+ "movd 0x00(%0,%4,1),%%xmm3 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
@@ -1113,32 +1113,32 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
- "lea 0x00(%0,%5,1),%5 \n"
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(%0,%5,1),%5 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movhps 0x00(%0,%1,1),%%xmm0 \n"
- "movq 0x00(%0,%1,2),%%xmm1 \n"
- "movhps 0x00(%0,%4,1),%%xmm1 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "movq (%5),%%xmm2 \n"
- "movhps 0x00(%5,%1,1),%%xmm2 \n"
- "movq 0x00(%5,%1,2),%%xmm3 \n"
- "movhps 0x00(%5,%4,1),%%xmm3 \n"
- "lea 0x00(%5,%1,4),%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps 0x00(%0,%1,1),%%xmm0 \n"
+ "movq 0x00(%0,%1,2),%%xmm1 \n"
+ "movhps 0x00(%0,%4,1),%%xmm1 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps 0x00(%5,%1,1),%%xmm2 \n"
+ "movq 0x00(%5,%1,2),%%xmm3 \n"
+ "movhps 0x00(%5,%4,1),%%xmm3 \n"
+ "lea 0x00(%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
@@ -1156,56 +1156,56 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int dx) {
intptr_t x0, x1;
asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x11,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x5,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "cmp $0x0,%4 \n"
- "jl 99f \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
LABELALIGN
"40: \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "pextrw $0x7,%%xmm2,%k1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%3,%0,4),%%xmm1 \n"
- "movd 0x00(%3,%1,4),%%xmm4 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "punpcklqdq %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%3,%0,4),%%xmm1 \n"
+ "movd 0x00(%3,%1,4),%%xmm4 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
"49: \n"
- "test $0x2,%4 \n"
- "je 29f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%2) \n"
- "lea 0x8(%2),%2 \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%2) \n"
+ "lea 0x8(%2),%2 \n"
"29: \n"
- "test $0x1,%4 \n"
- "je 99f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
"99: \n"
: "=&a"(x0), // %0
"=&d"(x1), // %1
@@ -1230,16 +1230,16 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpckldq %%xmm0,%%xmm0 \n"
- "punpckhdq %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -1267,63 +1267,64 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int dx) {
intptr_t x0, x1;
asm volatile(
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm5 \n"
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movhps 0x00(%1,%4,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%0) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movhps 0x00(%1,%4,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
LABELALIGN
"29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "psrlw $0x9,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%0) \n"
-
- LABELALIGN "99: \n" // clang-format error.
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%0) \n"
+
+ LABELALIGN
+ "99: \n" // clang-format error.
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -1339,10 +1340,10 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int FixedDiv_X86(int num, int div) {
asm volatile(
"cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
@@ -1353,13 +1354,13 @@ int FixedDiv_X86(int num, int div) {
int FixedDiv1_X86(int num, int div) {
asm volatile(
"cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "sub $0x10001,%%eax \n"
- "sbb $0x0,%%edx \n"
- "sub $0x1,%1 \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n"
+ "sbb $0x0,%%edx \n"
+ "sub $0x1,%1 \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
@@ -1379,30 +1380,30 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5, %%xmm5 \n" // zero
- "movdqa %4,%%xmm1 \n" // split shuffler
- "movdqa %5,%%xmm3 \n" // merge shuffler
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5, %%xmm5 \n" // zero
+ "movdqa %4,%%xmm1 \n" // split shuffler
+ "movdqa %5,%%xmm3 \n" // merge shuffler
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n" // 8 UV row 0
- "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
- "lea 0x10(%0),%0 \n"
- "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
- "pshufb %%xmm1,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n" // vertical add
- "psrlw $0x1,%%xmm0 \n" // round
- "pavgw %%xmm5,%%xmm0 \n"
- "pshufb %%xmm3,%%xmm0 \n" // merge uv
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n" // 4 UV
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n" // 8 UV row 0
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
+ "pshufb %%xmm1,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n" // vertical add
+ "psrlw $0x1,%%xmm0 \n" // round
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm3,%%xmm0 \n" // merge uv
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n" // 4 UV
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1419,31 +1420,31 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
"vbroadcastf128 %4,%%ymm1 \n" // split shuffler
"vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
- "lea 0x20(%0),%0 \n"
- "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
- "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n" // 8 UV
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
+ "lea 0x20(%0),%0 \n"
+ "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
+ "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n" // 8 UV
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
diff --git a/chromium/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libyuv/source/scale_neon.cc
index b626fc2987f..572b4bfa9b3 100644
--- a/chromium/third_party/libyuv/source/scale_neon.cc
+++ b/chromium/third_party/libyuv/source/scale_neon.cc
@@ -31,10 +31,10 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into q0, odd into q1
- "vld2.8 {q0, q1}, [%0]! \n"
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -51,11 +51,11 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -71,21 +71,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %0 \n"
+ "add %1, %0 \n"
"1: \n"
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
// row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
// pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -102,10 +102,10 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -122,20 +122,20 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
- "vld1.8 {q1}, [%3]! \n"
- "vld1.8 {q2}, [%4]! \n"
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [%3]! \n"
+ "vld1.8 {q2}, [%4]! \n"
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -156,11 +156,11 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -173,49 +173,49 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -230,31 +230,31 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -282,15 +282,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vld1.8 {q3}, [%3] \n"
+ "vld1.8 {q3}, [%3] \n"
"1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- "vst1.8 {d4}, [%1]! \n"
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -306,57 +306,57 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
asm volatile(
- "vld1.16 {q13}, [%5] \n"
- "vld1.8 {q14}, [%6] \n"
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%5] \n"
+ "vld1.8 {q14}, [%6] \n"
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
+ "vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -364,24 +364,24 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
@@ -390,14 +390,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// Align for table lookup, vtbl requires registers to
// be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -416,46 +416,46 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
+ "vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -463,22 +463,22 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
@@ -487,14 +487,14 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// Align for table lookup, vtbl requires registers to
// be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -511,13 +511,13 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
int src_width) {
asm volatile(
"1: \n"
- "vld1.16 {q1, q2}, [%1] \n" // load accumulator
- "vld1.8 {q0}, [%0]! \n" // load 16 bytes
- "vaddw.u8 q2, q2, d1 \n" // add
- "vaddw.u8 q1, q1, d0 \n"
- "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
- "subs %2, %2, #16 \n" // 16 processed per loop
- "bgt 1b \n"
+ "vld1.16 {q1, q2}, [%1] \n" // load accumulator
+ "vld1.8 {q0}, [%0]! \n" // load 16 bytes
+ "vaddw.u8 q2, q2, d1 \n" // add
+ "vaddw.u8 q1, q1, d0 \n"
+ "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -547,17 +547,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
int* tmp = dx_offset;
const uint8_t* src_tmp = src_ptr;
asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q3, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q1, q1, q0 \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "vadd.s32 q2, q1, q3 \n"
- "vshl.i32 q0, q3, #1 \n" // 8 * dx
- "1: \n"
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -566,27 +566,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
- "vmov q10, q1 \n"
- "vmov q11, q2 \n"
- "vuzp.16 q10, q11 \n"
- "vmovl.u8 q8, d6 \n"
- "vmovl.u8 q9, d7 \n"
- "vsubl.s16 q11, d18, d16 \n"
- "vsubl.s16 q12, d19, d17 \n"
- "vmovl.u16 q13, d20 \n"
- "vmovl.u16 q10, d21 \n"
- "vmul.s32 q11, q11, q13 \n"
- "vmul.s32 q12, q12, q10 \n"
- "vrshrn.s32 d18, q11, #16 \n"
- "vrshrn.s32 d19, q12, #16 \n"
- "vadd.s16 q8, q8, q9 \n"
- "vmovn.s16 d6, q8 \n"
-
- "vst1.8 {d6}, [%0]! \n" // store pixels
- "vadd.s32 q1, q1, q0 \n"
- "vadd.s32 q2, q2, q0 \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vrshrn.s32 d18, q11, #16 \n"
+ "vrshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
+
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -609,75 +609,75 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
-
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
// Blend 25 / 75.
"25: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
// Blend 75 / 25.
"75: \n"
- "vld1.8 {q1}, [%1]! \n"
- "vld1.8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
- "b 99f \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
"99: \n"
- "vst1.8 {d1[7]}, [%0] \n"
+ "vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@@ -694,12 +694,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vmov q2, q1 \n" // load next 8 ARGB
- "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vmov q2, q1 \n" // load next 8 ARGB
+ "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -722,13 +722,13 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vrhadd.u8 q1, q2, q3 \n" // rounding half add
- "vst2.32 {q0, q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vrhadd.u8 q1, q2, q3 \n" // rounding half add
+ "vst2.32 {q0, q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -743,27 +743,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -781,15 +781,15 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
- "mov r12, %3, lsl #2 \n"
+ "mov r12, %3, lsl #2 \n"
"1: \n"
- "vld1.32 {d0[0]}, [%0], r12 \n"
- "vld1.32 {d0[1]}, [%0], r12 \n"
- "vld1.32 {d1[0]}, [%0], r12 \n"
- "vld1.32 {d1[1]}, [%0], r12 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -805,30 +805,30 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
- "mov r12, %4, lsl #2 \n"
- "add %1, %1, %0 \n"
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
- "vld1.8 {d1}, [%1], r12 \n"
- "vld1.8 {d2}, [%0], r12 \n"
- "vld1.8 {d3}, [%1], r12 \n"
- "vld1.8 {d4}, [%0], r12 \n"
- "vld1.8 {d5}, [%1], r12 \n"
- "vld1.8 {d6}, [%0], r12 \n"
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
@@ -865,8 +865,8 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(d3, 1)
// clang-format on
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -897,16 +897,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
int* tmp = dx_offset;
const uint8_t* src_tmp = src_argb;
asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q9, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
- "vmov.i8 q3, #0x7f \n" // 0x7F
- "vmov.i16 q15, #0x7f \n" // 0x7F
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q8, q1, q0 \n"
- "1: \n"
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
@@ -951,26 +951,26 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
#undef LOAD2_DATA32_LANE
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
- "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts.
- "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV
- "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV
- "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
- "vrshrn.u16 d1, q1, #2 \n"
- "vst2.8 {d0, d1}, [%2]! \n"
- "bgt 1b \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts.
+ "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV
+ "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV
+ "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst2.8 {d0, d1}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -979,6 +979,35 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
: "memory", "cc", "q0", "q1", "q8", "q9");
}
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld1.16 {d0[0]}, [%0], %6 \n"
+ "vld1.16 {d0[1]}, [%1], %6 \n"
+ "vld1.16 {d0[2]}, [%2], %6 \n"
+ "vld1.16 {d0[3]}, [%3], %6 \n"
+ "subs %5, %5, #4 \n" // 4 pixels per loop.
+ "vst1.8 {d0}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"(src_stepx * 8) // %6
+ : "memory", "cc", "d0");
+}
+
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
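
For reference, a scalar sketch of what the newly added ScaleUVRowDownEven_NEON above does: copy every src_stepx-th UV pair (2 bytes per pixel) to the destination; the NEON version unrolls this four pixels at a time through the staggered src1/src2/src3 pointers. The name is illustrative, not necessarily the library's C fallback:

#include <stddef.h>
#include <stdint.h>

static void ScaleUVRowDownEven_sketch(const uint8_t* src_uv,
                                      ptrdiff_t src_stride,  // unused, one row
                                      int src_stepx, uint8_t* dst_uv,
                                      int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_uv[2 * x + 0] = src_uv[0];  // U
    dst_uv[2 * x + 1] = src_uv[1];  // V
    src_uv += src_stepx * 2;        // advance src_stepx UV pixels
  }
}
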
diff --git a/chromium/third_party/libyuv/source/scale_neon64.cc b/chromium/third_party/libyuv/source/scale_neon64.cc
index c45b7abecea..185591cb55b 100644
--- a/chromium/third_party/libyuv/source/scale_neon64.cc
+++ b/chromium/third_party/libyuv/source/scale_neon64.cc
@@ -29,11 +29,11 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -51,12 +51,12 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -72,21 +72,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "uaddlp v1.8h, v1.16b \n"
- "prfm pldl1keep, [%1, 448] \n"
- "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
- "uadalp v1.8h, v3.16b \n"
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn2 v0.16b, v1.8h, #2 \n"
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -103,11 +103,11 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "st1 {v2.8b}, [%1], #8 \n"
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -124,23 +124,23 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
- "ld1 {v1.16b}, [%2], #16 \n"
- "ld1 {v2.16b}, [%3], #16 \n"
- "ld1 {v3.16b}, [%4], #16 \n"
- "subs %w5, %w5, #4 \n"
- "uaddlp v0.8h, v0.16b \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "uadalp v0.8h, v1.16b \n"
- "prfm pldl1keep, [%2, 448] \n"
- "uadalp v0.8h, v2.16b \n"
- "prfm pldl1keep, [%3, 448] \n"
- "uadalp v0.8h, v3.16b \n"
- "prfm pldl1keep, [%4, 448] \n"
- "addp v0.8h, v0.8h, v0.8h \n"
- "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
- "st1 {v0.s}[0], [%1], #4 \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uadalp v0.8h, v3.16b \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_ptr1), // %2
@@ -160,13 +160,13 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #24 \n"
- "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -179,51 +179,51 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "ushll v16.8h, v4.8b, #0 \n"
- "ushll v17.8h, v5.8b, #0 \n"
- "ushll v18.8h, v6.8b, #0 \n"
- "ushll v19.8h, v7.8b, #0 \n"
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
// 3 * line_0 + line_1
- "umlal v16.8h, v0.8b, v20.8b \n"
- "umlal v17.8h, v1.8b, v20.8b \n"
- "umlal v18.8h, v2.8b, v20.8b \n"
- "umlal v19.8h, v3.8b, v20.8b \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// (3 * line_0 + line_1) >> 2
- "uqrshrn v0.8b, v16.8h, #2 \n"
- "uqrshrn v1.8b, v17.8h, #2 \n"
- "uqrshrn v2.8b, v18.8h, #2 \n"
- "uqrshrn v3.8b, v19.8h, #2 \n"
- "prfm pldl1keep, [%3, 448] \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v16.8h, v1.8b, #0 \n"
- "umlal v16.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v16.8h, #2 \n"
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v16.8h, v2.8b, #0 \n"
- "umlal v16.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v16.8h, #2 \n"
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -238,35 +238,35 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
// average src line 0 with src line 1
- "urhadd v0.8b, v0.8b, v4.8b \n"
- "urhadd v1.8b, v1.8b, v5.8b \n"
- "urhadd v2.8b, v2.8b, v6.8b \n"
- "urhadd v3.8b, v3.8b, v7.8b \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v4.8h, v1.8b, #0 \n"
- "umlal v4.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v4.8h, #2 \n"
- "prfm pldl1keep, [%3, 448] \n"
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v4.8h, v2.8b, #0 \n"
- "umlal v4.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v4.8h, #2 \n"
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -293,15 +293,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "ld1 {v3.16b}, [%3] \n"
- "1: \n"
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #12 \n"
- "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "st1 {v2.8b}, [%1], #8 \n"
- "st1 {v2.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -318,68 +318,68 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t tmp_src_stride = src_stride;
asm volatile(
- "ld1 {v29.8h}, [%5] \n"
- "ld1 {v30.16b}, [%6] \n"
- "ld1 {v31.8h}, [%7] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ "ld1 {v29.8h}, [%5] \n"
+ "ld1 {v30.16b}, [%6] \n"
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
- "subs %w4, %w4, #12 \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
      // Shuffle the input data around to get the data aligned
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
- "trn1 v20.8b, v0.8b, v1.8b \n"
- "trn2 v21.8b, v0.8b, v1.8b \n"
- "trn1 v22.8b, v4.8b, v5.8b \n"
- "trn2 v23.8b, v4.8b, v5.8b \n"
- "trn1 v24.8b, v16.8b, v17.8b \n"
- "trn2 v25.8b, v16.8b, v17.8b \n"
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
- "trn1 v16.8b, v18.8b, v19.8b \n"
- "trn2 v17.8b, v18.8b, v19.8b \n"
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
- "uaddlp v20.4h, v20.8b \n"
- "uaddlp v21.4h, v21.8b \n"
- "uaddlp v22.4h, v22.8b \n"
- "uaddlp v23.4h, v23.8b \n"
- "uaddlp v24.4h, v24.8b \n"
- "uaddlp v25.4h, v25.8b \n"
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
// 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
- "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
// combine source lines
- "add v20.4h, v20.4h, v22.4h \n"
- "add v21.4h, v21.4h, v23.4h \n"
- "add v20.4h, v20.4h, v24.4h \n"
- "add v21.4h, v21.4h, v25.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
- "add v2.4h, v2.4h, v17.4h \n"
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
- "sqrdmulh v2.8h, v2.8h, v29.8h \n"
- "xtn v2.8b, v2.8h \n"
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -387,38 +387,38 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "ushll v16.8h, v16.8b, #0 \n"
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// combine source lines
- "add v0.8h, v0.8h, v16.8h \n"
+ "add v0.8h, v0.8h, v16.8h \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
- "add v20.8h, v20.8h, v0.8h \n"
- "add v21.8h, v21.8h, v4.8h \n"
- "prfm pldl1keep, [%2, 448] \n"
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
      // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "sqrdmulh v0.8h, v20.8h, v31.8h \n"
- "sqrdmulh v1.8h, v21.8h, v31.8h \n"
- "prfm pldl1keep, [%3, 448] \n"
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
// Align for table lookup, vtbl requires registers to be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -440,53 +440,53 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile(
- "ld1 {v30.8h}, [%4] \n"
- "ld1 {v31.16b}, [%5] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ "ld1 {v30.8h}, [%4] \n"
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "subs %w3, %w3, #12 \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
      // Shuffle the input data around to get the data aligned
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
- "trn1 v16.8b, v0.8b, v1.8b \n"
- "trn2 v17.8b, v0.8b, v1.8b \n"
- "trn1 v18.8b, v4.8b, v5.8b \n"
- "trn2 v19.8b, v4.8b, v5.8b \n"
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
- "uaddlp v16.4h, v16.8b \n"
- "uaddlp v17.4h, v17.8b \n"
- "uaddlp v18.4h, v18.8b \n"
- "uaddlp v19.4h, v19.8b \n"
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
// 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
// combine source lines
- "add v16.4h, v16.4h, v18.4h \n"
- "add v17.4h, v17.4h, v19.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "uqrshrn v2.8b, v2.8h, #2 \n"
+ "uqrshrn v2.8b, v2.8h, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -496,35 +496,35 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
// combine source lines
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
- "add v16.8h, v16.8h, v0.8h \n"
- "add v17.8h, v17.8h, v4.8h \n"
- "prfm pldl1keep, [%2, 448] \n"
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
      // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "sqrdmulh v0.8h, v16.8h, v30.8h \n"
- "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -542,14 +542,14 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
int src_width) {
asm volatile(
"1: \n"
- "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
- "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "uaddw v1.8h, v1.8h, v0.8b \n"
- "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "b.gt 1b \n"
+ "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
+ "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v1.8h, v1.8h, v0.8b \n"
+ "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -581,17 +581,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
int64_t x64 = (int64_t)x; // NOLINT
int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v3.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v1.4s, v1.4s, v0.4s \n"
+ "add v1.4s, v1.4s, v0.4s \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "add v2.4s, v1.4s, v3.4s \n"
- "shl v0.4s, v3.4s, #1 \n" // 8 * dx
- "1: \n"
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -600,27 +600,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
- "mov v6.16b, v1.16b \n"
- "mov v7.16b, v2.16b \n"
- "uzp1 v6.8h, v6.8h, v7.8h \n"
- "ushll v4.8h, v4.8b, #0 \n"
- "ushll v5.8h, v5.8b, #0 \n"
- "ssubl v16.4s, v5.4h, v4.4h \n"
- "ssubl2 v17.4s, v5.8h, v4.8h \n"
- "ushll v7.4s, v6.4h, #0 \n"
- "ushll2 v6.4s, v6.8h, #0 \n"
- "mul v16.4s, v16.4s, v7.4s \n"
- "mul v17.4s, v17.4s, v6.4s \n"
- "rshrn v6.4h, v16.4s, #16 \n"
- "rshrn2 v6.8h, v17.4s, #16 \n"
- "add v4.8h, v4.8h, v6.8h \n"
- "xtn v4.8b, v4.8h \n"
-
- "st1 {v4.8b}, [%0], #8 \n" // store pixels
- "add v1.4s, v1.4s, v0.4s \n"
- "add v2.4s, v2.4s, v0.4s \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n"
+ "ushll v4.8h, v4.8b, #0 \n"
+ "ushll v5.8h, v5.8b, #0 \n"
+ "ssubl v16.4s, v5.4h, v4.4h \n"
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n"
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n"
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "rshrn v6.4h, v16.4s, #16 \n"
+ "rshrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n"
+ "xtn v4.8b, v4.8h \n"
+
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n"
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -644,83 +644,83 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "add %2, %2, %1 \n"
- "cmp %w4, #64 \n"
- "b.eq 75f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
- "cmp %w4, #192 \n"
- "b.eq 25f \n"
-
- "dup v5.8b, %w4 \n"
- "dup v4.8b, %w5 \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
+
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
// General purpose row blend.
"1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v6.8h, v0.8b, v4.8b \n"
- "umull2 v7.8h, v0.16b, v4.16b \n"
- "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
- "umlal v6.8h, v1.8b, v5.8b \n"
- "umlal2 v7.8h, v1.16b, v5.16b \n"
- "prfm pldl1keep, [%2, 448] \n"
- "rshrn v0.8b, v6.8h, #8 \n"
- "rshrn2 v0.16b, v7.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
// Blend 25 / 75.
"25: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "prfm pldl1keep, [%2, 448] \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 25b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "prfm pldl1keep, [%2, 448] \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
// Blend 75 / 25.
"75: \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "ld1 {v0.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "prfm pldl1keep, [%2, 448] \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 75b \n"
- "b 99f \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
- "st1 {v0.b}[15], [%0] \n"
+ "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@@ -739,12 +739,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "mov v2.16b, v3.16b \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
- "b.gt 1b \n"
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "mov v2.16b, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -761,14 +761,14 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
asm volatile(
"1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
-
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "urhadd v1.16b, v2.16b, v3.16b \n"
- "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
- "b.gt 1b \n"
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "urhadd v1.16b, v2.16b, v3.16b \n"
+ "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -783,27 +783,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
- "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
- "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
- "prfm pldl1keep, [%1, 448] \n"
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn v1.8b, v1.8h, #2 \n"
- "rshrn v2.8b, v2.8h, #2 \n"
- "rshrn v3.8b, v3.8h, #2 \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -822,14 +822,14 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"1: \n"
- "ld1 {v0.s}[0], [%0], %3 \n"
- "ld1 {v0.s}[1], [%0], %3 \n"
- "ld1 {v0.s}[2], [%0], %3 \n"
- "ld1 {v0.s}[3], [%0], %3 \n"
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -847,35 +847,35 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
- "ld1 {v1.8b}, [%1], %4 \n"
- "ld1 {v2.8b}, [%0], %4 \n"
- "ld1 {v3.8b}, [%1], %4 \n"
- "ld1 {v4.8b}, [%0], %4 \n"
- "ld1 {v5.8b}, [%1], %4 \n"
- "ld1 {v6.8b}, [%0], %4 \n"
- "ld1 {v7.8b}, [%1], %4 \n"
- "uaddl v0.8h, v0.8b, v1.8b \n"
- "uaddl v2.8h, v2.8b, v3.8b \n"
- "uaddl v4.8h, v4.8b, v5.8b \n"
- "uaddl v6.8h, v6.8b, v7.8b \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
- "mov v0.d[1], v2.d[0] \n"
- "mov v2.d[0], v16.d[1] \n"
- "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
- "mov v4.d[1], v6.d[0] \n"
- "mov v6.d[0], v16.d[1] \n"
- "prfm pldl1keep, [%1, 448] \n"
- "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
- "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
- "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
- "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
- "subs %w3, %w3, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
+ "ld1 {v1.8b}, [%1], %4 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
@@ -912,11 +912,11 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
- "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
// clang-format on
- "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -949,16 +949,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
int64_t x64 = (int64_t)x; // NOLINT
int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v6.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
- "movi v3.16b, #0x7f \n" // 0x7F
- "movi v4.8h, #0x7f \n" // 0x7F
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v5.4s, v1.4s, v0.4s \n"
- "1: \n"
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(v0, v1, 0)
@@ -1010,21 +1010,21 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+ "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
"1: \n"
- "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #8 \n" // 8 processed per loop
- "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
- "uaddlp v1.4s, v1.8h \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
- "uadalp v1.4s, v3.8h \n"
- "prfm pldl1keep, [%1, 448] \n"
- "rshrn v0.4h, v0.4s, #2 \n" // round and pack
- "rshrn2 v0.8h, v1.4s, #2 \n"
- "st1 {v0.8h}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #8 \n" // 8 processed per loop
+ "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
+ "uaddlp v1.4s, v1.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
+ "uadalp v1.4s, v3.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.4h, v0.4s, #2 \n" // round and pack
+ "rshrn2 v0.8h, v1.4s, #2 \n"
+ "st1 {v0.8h}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -1041,40 +1041,40 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
uint16_t* dst,
int dst_width) {
asm volatile(
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
- "movi v0.8h, #9 \n" // constants
- "movi v1.4s, #3 \n"
+ "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+ "movi v0.8h, #9 \n" // constants
+ "movi v1.4s, #3 \n"
"1: \n"
- "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
- "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
- "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
- "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
- "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
- "umull v16.4s, v3.4h, v0.4h \n"
- "umull2 v7.4s, v3.8h, v0.8h \n"
- "umull v18.4s, v4.4h, v0.4h \n"
- "umull2 v17.4s, v4.8h, v0.8h \n"
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "uaddw v16.4s, v16.4s, v6.4h \n"
- "uaddl2 v19.4s, v6.8h, v3.8h \n"
- "uaddl v3.4s, v6.4h, v3.4h \n"
- "uaddw2 v6.4s, v7.4s, v6.8h \n"
- "uaddl2 v7.4s, v5.8h, v4.8h \n"
- "uaddl v4.4s, v5.4h, v4.4h \n"
- "uaddw v18.4s, v18.4s, v5.4h \n"
- "prfm pldl1keep, [%1, 448] \n"
- "mla v16.4s, v4.4s, v1.4s \n"
- "mla v18.4s, v3.4s, v1.4s \n"
- "mla v6.4s, v7.4s, v1.4s \n"
- "uaddw2 v4.4s, v17.4s, v5.8h \n"
- "uqrshrn v16.4h, v16.4s, #4 \n"
- "mla v4.4s, v19.4s, v1.4s \n"
- "uqrshrn2 v16.8h, v6.4s, #4 \n"
- "uqrshrn v17.4h, v18.4s, #4 \n"
- "uqrshrn2 v17.8h, v4.4s, #4 \n"
- "st2 {v16.8h-v17.8h}, [%2], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
+ "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
+ "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
+ "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
+ "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
+ "umull v16.4s, v3.4h, v0.4h \n"
+ "umull2 v7.4s, v3.8h, v0.8h \n"
+ "umull v18.4s, v4.4h, v0.4h \n"
+ "umull2 v17.4s, v4.8h, v0.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v16.4s, v16.4s, v6.4h \n"
+ "uaddl2 v19.4s, v6.8h, v3.8h \n"
+ "uaddl v3.4s, v6.4h, v3.4h \n"
+ "uaddw2 v6.4s, v7.4s, v6.8h \n"
+ "uaddl2 v7.4s, v5.8h, v4.8h \n"
+ "uaddl v4.4s, v5.4h, v4.4h \n"
+ "uaddw v18.4s, v18.4s, v5.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "mla v16.4s, v4.4s, v1.4s \n"
+ "mla v18.4s, v3.4s, v1.4s \n"
+ "mla v6.4s, v7.4s, v1.4s \n"
+ "uaddw2 v4.4s, v17.4s, v5.8h \n"
+ "uqrshrn v16.4h, v16.4s, #4 \n"
+ "mla v4.4s, v19.4s, v1.4s \n"
+ "uqrshrn2 v16.8h, v6.4s, #4 \n"
+ "uqrshrn v17.4h, v18.4s, #4 \n"
+ "uqrshrn2 v17.8h, v4.4s, #4 \n"
+ "st2 {v16.8h-v17.8h}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -1092,21 +1092,21 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
- "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16
- "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
- "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "prfm pldl1keep, [%1, 448] \n"
- "rshrn v1.8b, v1.8h, #2 \n"
- "st2 {v0.8b,v1.8b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
+ "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16
+ "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -1115,6 +1115,35 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
: "memory", "cc", "v0", "v1", "v16", "v17");
}
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.h}[0], [%0], %6 \n"
+ "ld1 {v1.h}[0], [%1], %6 \n"
+ "ld1 {v2.h}[0], [%2], %6 \n"
+ "ld1 {v3.h}[0], [%3], %6 \n"
+ "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+ "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"((int64_t)(src_stepx * 8)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/scale_uv.cc b/chromium/third_party/libyuv/source/scale_uv.cc
index 4e276518aaf..c57df5959b9 100644
--- a/chromium/third_party/libyuv/source/scale_uv.cc
+++ b/chromium/third_party/libyuv/source/scale_uv.cc
@@ -299,6 +299,14 @@ static void ScaleUVDownEven(int src_width,
}
#endif
#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && !filtering) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif  // TODO(fbarchard): Enable Box filter
+#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
: ScaleUVRowDownEven_Any_NEON;
@@ -484,7 +492,7 @@ static void ScaleUVBilinearUp(int src_width,
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(dst_width, 4)) {
+ if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
@@ -492,7 +500,7 @@ static void ScaleUVBilinearUp(int src_width,
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(dst_width, 8)) {
+ if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
@@ -500,7 +508,7 @@ static void ScaleUVBilinearUp(int src_width,
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(dst_width, 4)) {
+ if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_NEON;
}
}
@@ -508,7 +516,7 @@ static void ScaleUVBilinearUp(int src_width,
#if defined(HAS_INTERPOLATEROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(dst_width, 2)) {
+ if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_MMI;
}
}
@@ -516,7 +524,7 @@ static void ScaleUVBilinearUp(int src_width,
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
- if (IS_ALIGNED(dst_width, 8)) {
+ if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_MSA;
}
}
@@ -532,7 +540,7 @@ static void ScaleUVBilinearUp(int src_width,
#if defined(HAS_SCALEUVFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
- if (IS_ALIGNED(dst_width, 4)) {
+ if (IS_ALIGNED(dst_width, 8)) {
ScaleUVFilterCols = ScaleUVFilterCols_NEON;
}
}
@@ -540,7 +548,7 @@ static void ScaleUVBilinearUp(int src_width,
#if defined(HAS_SCALEUVFILTERCOLS_MSA)
if (filtering && TestCpuFlag(kCpuHasMSA)) {
ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
- if (IS_ALIGNED(dst_width, 8)) {
+ if (IS_ALIGNED(dst_width, 16)) {
ScaleUVFilterCols = ScaleUVFilterCols_MSA;
}
}
@@ -553,7 +561,7 @@ static void ScaleUVBilinearUp(int src_width,
#if defined(HAS_SCALEUVCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleUVFilterCols = ScaleUVCols_Any_NEON;
- if (IS_ALIGNED(dst_width, 8)) {
+ if (IS_ALIGNED(dst_width, 16)) {
ScaleUVFilterCols = ScaleUVCols_NEON;
}
}
@@ -569,7 +577,7 @@ static void ScaleUVBilinearUp(int src_width,
#if defined(HAS_SCALEUVCOLS_MSA)
if (!filtering && TestCpuFlag(kCpuHasMSA)) {
ScaleUVFilterCols = ScaleUVCols_Any_MSA;
- if (IS_ALIGNED(dst_width, 4)) {
+ if (IS_ALIGNED(dst_width, 8)) {
ScaleUVFilterCols = ScaleUVCols_MSA;
}
}
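The alignment changes in the hunks above all follow libyuv's usual dispatch idiom: start from the C row function, switch to the "_Any_" SIMD wrapper when the CPU flag is set (it copes with widths that are not a multiple of the kernel's step), and pick the full-width kernel only when dst_width is a multiple of the per-iteration pixel count. The doubled constants are consistent with UV pixels being 2 bytes each, so the same vector width covers twice as many pixels as the values apparently inherited from the ARGB scaler. A condensed sketch of the idiom, assuming libyuv's row.h and cpu_id.h are included; PickInterpolateRow is a hypothetical helper name used only for illustration, and the real code repeats this selection inline, once per instruction set:

typedef void (*InterpolateRowFn)(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                 ptrdiff_t src_stride, int width,
                                 int source_y_fraction);

static InterpolateRowFn PickInterpolateRow(int dst_width) {
  InterpolateRowFn row = InterpolateRow_C;  // always-correct fallback
#if defined(HAS_INTERPOLATEROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    row = InterpolateRow_Any_NEON;   // any width; handles the ragged tail
    if (IS_ALIGNED(dst_width, 8)) {  // 8 UV pixels = 16 bytes per iteration
      row = InterpolateRow_NEON;     // full-SIMD kernel, no tail handling
    }
  }
#endif
  return row;
}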
@@ -836,7 +844,6 @@ static void ScaleUV(const uint8_t* src,
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
}
-
#if HAS_SCALEUVBILINEARUP
if (filtering && dy < 65536) {
ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
diff --git a/chromium/third_party/libyuv/unit_test/planar_test.cc b/chromium/third_party/libyuv/unit_test/planar_test.cc
index 65aa46e0dae..e05ff15640c 100644
--- a/chromium/third_party/libyuv/unit_test/planar_test.cc
+++ b/chromium/third_party/libyuv/unit_test/planar_test.cc
@@ -3562,4 +3562,68 @@ TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_uv_c);
}
+TEST_F(LibYUVPlanarTest, NV12Copy) {
+ const int halfwidth = (benchmark_width_ + 1) >> 1;
+ const int halfheight = (benchmark_height_ + 1) >> 1;
+ align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_uv, halfwidth * 2 * halfheight);
+ align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_uv, halfwidth * 2 * halfheight);
+
+ MemRandomize(src_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_uv, halfwidth * 2 * halfheight);
+ MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(dst_uv, halfwidth * 2 * halfheight);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ NV12Copy(src_y, benchmark_width_, src_uv, halfwidth * 2, dst_y,
+ benchmark_width_, dst_uv, halfwidth * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(src_y[i], dst_y[i]);
+ }
+ for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
+ EXPECT_EQ(src_uv[i], dst_uv[i]);
+ }
+
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_uv);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVPlanarTest, NV21Copy) {
+ const int halfwidth = (benchmark_width_ + 1) >> 1;
+ const int halfheight = (benchmark_height_ + 1) >> 1;
+ align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_vu, halfwidth * 2 * halfheight);
+ align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_vu, halfwidth * 2 * halfheight);
+
+ MemRandomize(src_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_vu, halfwidth * 2 * halfheight);
+ MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(dst_vu, halfwidth * 2 * halfheight);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ NV21Copy(src_y, benchmark_width_, src_vu, halfwidth * 2, dst_y,
+ benchmark_width_, dst_vu, halfwidth * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(src_y[i], dst_y[i]);
+ }
+ for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
+ EXPECT_EQ(src_vu[i], dst_vu[i]);
+ }
+
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_vu);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
} // namespace libyuv
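For context on what these two new tests exercise: NV12Copy and NV21Copy copy the Y plane plus the interleaved half-resolution chroma plane. A plausible implementation in terms of the existing CopyPlane API follows; this is a sketch only, not necessarily the exact code added to planar_functions.cc in this change:

#include "libyuv/planar_functions.h"  // CopyPlane

static void NV12CopySketch(const uint8_t* src_y, int src_stride_y,
                           const uint8_t* src_uv, int src_stride_uv,
                           uint8_t* dst_y, int dst_stride_y,
                           uint8_t* dst_uv, int dst_stride_uv,
                           int width, int height) {
  int halfwidth = (width + 1) >> 1;    // chroma is subsampled 2x2
  int halfheight = (height + 1) >> 1;
  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  // The UV plane holds halfwidth interleaved pairs, i.e. halfwidth * 2 bytes
  // per row, matching the strides the tests pass in.
  CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
            halfheight);
}

NV21 is the same operation on VU-ordered chroma; since this is a plain byte copy, the channel order does not affect the result.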
diff --git a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
index 2fdf5f60341..c04a236a1a4 100644
--- a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
+++ b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
@@ -312,6 +312,21 @@ TEST_SCALETO(ARGBScale, 1920, 1080)
#undef TEST_SCALETO1
#undef TEST_SCALETO
+#define TEST_SCALESWAPXY1(name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \
+ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test scale with swapped width and height with all 3 filters.
+TEST_SCALESWAPXY1(ARGBScale, None, 0)
+TEST_SCALESWAPXY1(ARGBScale, Linear, 0)
+TEST_SCALESWAPXY1(ARGBScale, Bilinear, 0)
+#undef TEST_SCALESWAPXY1
+
// Scale with YUV conversion to ARGB and clipping.
// TODO(fbarchard): Add fourcc support. All 4 ARGB formats are easy to support.
LIBYUV_API
diff --git a/chromium/third_party/libyuv/unit_test/scale_test.cc b/chromium/third_party/libyuv/unit_test/scale_test.cc
index aa36202e825..d5294110be1 100644
--- a/chromium/third_party/libyuv/unit_test/scale_test.cc
+++ b/chromium/third_party/libyuv/unit_test/scale_test.cc
@@ -771,6 +771,58 @@ TEST_SCALETO(Scale, 1920, 1080)
#undef TEST_SCALETO1
#undef TEST_SCALETO
+#define TEST_SCALESWAPXY1(DISABLED_, name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, I420##name##SwapXY_##filter) { \
+ int diff = I420TestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444##name##SwapXY_##filter) { \
+ int diff = I444TestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_16) { \
+ int diff = I420TestFilter_16(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_16) { \
+ int diff = I444TestFilter_16(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12##name##SwapXY_##filter) { \
+ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test scale with swapped width and height with all 4 filters.
+#ifdef ENABLE_SLOW_TESTS
+TEST_SCALESWAPXY1(, Scale, None, 0)
+TEST_SCALESWAPXY1(, Scale, Linear, 3)
+TEST_SCALESWAPXY1(, Scale, Bilinear, 3)
+TEST_SCALESWAPXY1(, Scale, Box, 3)
+#else
+TEST_SCALESWAPXY1(DISABLED_, Scale, None, 0)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Linear, 3)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Bilinear, 3)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Box, 3)
+#endif
+
+#undef TEST_SCALESWAPXY1
+
#ifdef ENABLE_ROW_TESTS
#ifdef HAS_SCALEROWDOWN2_SSSE3
TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
@@ -1052,4 +1104,153 @@ TEST_FACTOR(3, 1, 3, 0)
#undef TEST_FACTOR
#undef SX
#undef DX
+
+TEST_F(LibYUVScaleTest, PlaneTest3x) {
+ const int kSrcStride = 48;
+ const int kDstStride = 16;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 48 * 3; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations16 =
+ benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+ for (int i = 0; i < iterations16; ++i) {
+ ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(49, dest_pixels[0]);
+
+ ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(49, dest_pixels[0]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
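A note on the expected constant: with orig_pixels[i] = i the source holds 48 * row + col, so the first 3x3 block is {0, 1, 2; 48, 49, 50; 96, 97, 98}. Its centre sample is 49 and its mean is 441 / 9 = 49 as well, which is why both the bilinear and the kFilterNone paths expect dest_pixels[0] == 49.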
+
+TEST_F(LibYUVScaleTest, PlaneTest4x) {
+ const int kSrcStride = 64;
+ const int kDstStride = 16;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 64 * 4; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations16 =
+ benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+ for (int i = 0; i < iterations16; ++i) {
+ ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ((65 + 66 + 129 + 130 + 2) / 4, dest_pixels[0]);
+
+ ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(130, dest_pixels[0]); // expect the 3rd pixel of the 3rd row
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
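Similarly for the 4x case: the source holds 64 * row + col. The bilinear expectation is the rounded average of the four centre samples of the first 4x4 block, (row, col) = (1,1), (1,2), (2,1), (2,2) = 65, 66, 129, 130, i.e. (65 + 66 + 129 + 130 + 2) / 4 = 98, while kFilterNone picks the single sample at (2,2) = 130, the 3rd pixel of the 3rd row as the comment says.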
+
+// Intent is to test 200x50 to 50x200 but width and height can be parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_None) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_, benchmark_height_,
+ benchmark_width_, kFilterNone);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_,
+ benchmark_height_, dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_, kFilterNone);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_, benchmark_height_,
+ benchmark_width_, kFilterBilinear);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_,
+ benchmark_height_, dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_, kFilterBilinear);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+// Intent is to test 200x50 to 50x200 but width and height can be parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_, benchmark_height_,
+ benchmark_width_, kFilterBox);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_,
+ benchmark_height_, dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_, kFilterBox);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/scale_uv_test.cc b/chromium/third_party/libyuv/unit_test/scale_uv_test.cc
index b62bf3ad779..e45a25da4a5 100644
--- a/chromium/third_party/libyuv/unit_test/scale_uv_test.cc
+++ b/chromium/third_party/libyuv/unit_test/scale_uv_test.cc
@@ -176,6 +176,21 @@ TEST_SCALETO(UVScale, 1920, 1080)
#undef TEST_SCALETO1
#undef TEST_SCALETO
+#define TEST_SCALESWAPXY1(name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \
+ int diff = \
+ UVTestFilter(benchmark_width_, benchmark_height_, benchmark_height_, \
+ benchmark_width_, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test scale with swapped width and height with all 3 filters.
+TEST_SCALESWAPXY1(UVScale, None, 0)
+TEST_SCALESWAPXY1(UVScale, Linear, 0)
+TEST_SCALESWAPXY1(UVScale, Bilinear, 0)
+#undef TEST_SCALESWAPXY1
+
TEST_F(LibYUVScaleTest, UVTest3x) {
const int kSrcStride = 48 * 2;
const int kDstStride = 16 * 2;
diff --git a/chromium/third_party/libyuv/winarm.mk b/chromium/third_party/libyuv/winarm.mk
index c4307a431f9..b0a344ae06d 100644
--- a/chromium/third_party/libyuv/winarm.mk
+++ b/chromium/third_party/libyuv/winarm.mk
@@ -31,6 +31,7 @@ LOCAL_OBJ_FILES = \
source/scale_any.o\
source/scale_argb.o\
source/scale_common.o\
+ source/scale_uv.o\
source/video_common.o
.cc.o: