author     Allan Sandfeld Jensen <allan.jensen@qt.io>  2021-05-20 09:47:09 +0200
committer  Allan Sandfeld Jensen <allan.jensen@qt.io>  2021-06-07 11:15:42 +0000
commit     189d4fd8fad9e3c776873be51938cd31a42b6177 (patch)
tree       6497caeff5e383937996768766ab3bb2081a40b2 /chromium/third_party/libyuv
parent     8bc75099d364490b22f43a7ce366b366c08f4164 (diff)
download   qtwebengine-chromium-189d4fd8fad9e3c776873be51938cd31a42b6177.tar.gz
BASELINE: Update Chromium to 90.0.4430.221
Change-Id: Iff4d9d18d2fcf1a576f3b1f453010f744a232920
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/third_party/libyuv')
28 files changed, 8150 insertions, 7730 deletions
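
The functional core of this roll (libyuv 1767 -> 1770) is the new NV12Copy()/NV21Copy() pair added to planar_functions.{h,cc}; most of the remaining churn is whitespace realignment inside inline-assembly strings. As a quick orientation for reviewers, a minimal usage sketch of the new API follows; the frame-size constants, the buffer allocation, and the CopyFrame() wrapper are illustrative assumptions, not part of the patch:

  // Sketch: copy one NV12 frame via the NV12Copy() introduced in this roll.
  // kWidth/kHeight and the vector-backed buffers are hypothetical examples.
  #include <cstdint>
  #include <vector>
  #include "libyuv/planar_functions.h"

  int CopyFrame() {
    const int kWidth = 64;
    const int kHeight = 48;
    const int kHalfHeight = (kHeight + 1) / 2;  // chroma plane is half height
    // Y plane: kWidth x kHeight bytes. NV12 UV plane: interleaved U/V pairs,
    // 2 * ((kWidth + 1) / 2) bytes per row (== kWidth for even widths).
    std::vector<uint8_t> src_y(kWidth * kHeight), dst_y(kWidth * kHeight);
    std::vector<uint8_t> src_uv(kWidth * kHalfHeight), dst_uv(kWidth * kHalfHeight);
    // Pass -kHeight instead of kHeight to invert the image vertically.
    return libyuv::NV12Copy(src_y.data(), kWidth, src_uv.data(), kWidth,
                            dst_y.data(), kWidth, dst_uv.data(), kWidth,
                            kWidth, kHeight);  // 0 on success, -1 on bad args
  }

NV21Copy() has the identical signature with VU-ordered chroma and, per the implementation in the diff, simply forwards to NV12Copy().
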
diff --git a/chromium/third_party/libyuv/DEPS b/chromium/third_party/libyuv/DEPS
index 3558eb56c71..de185434500 100644
--- a/chromium/third_party/libyuv/DEPS
+++ b/chromium/third_party/libyuv/DEPS
@@ -1,24 +1,18 @@
-gclient_gn_args_file = 'src/build/config/gclient_args.gni'
-gclient_gn_args = [
-  'mac_xcode_version',
-]
-
 vars = {
   'chromium_git': 'https://chromium.googlesource.com',
-  'chromium_revision': '5aaa70b53c3cbb80a3bcf11d17c6806729f65a8b',
+  'chromium_revision': '64c8c30faaf969c15c028131dfcd0819208039c1',
   'gn_version': 'git_revision:6f13aaac55a977e1948910942675c69f2b4f7a94',
-  'mac_xcode_version': 'default',
 }
 
 deps = {
   'src/build':
-    Var('chromium_git') + '/chromium/src/build' + '@' + 'bddddb36e86375a17ccb10ab243a7c19c686268b',
+    Var('chromium_git') + '/chromium/src/build' + '@' + '2d2f9f2b85592bb9af5753ef300c055e6feb709f',
   'src/buildtools':
-    Var('chromium_git') + '/chromium/src/buildtools' + '@' + '98881a1297863de584fad20fb671e8c44ad1a7d0',
+    Var('chromium_git') + '/chromium/src/buildtools' + '@' + '6302c1175607a436e18947a5abe9df2209e845fc',
   'src/testing':
-    Var('chromium_git') + '/chromium/src/testing' + '@' + '7ade79a849fc4988e4418161920a410cb3b9f5ae',
+    Var('chromium_git') + '/chromium/src/testing' + '@' + '40b44171056045ed1f85ca0b57485e46c03d7867',
   'src/third_party':
-    Var('chromium_git') + '/chromium/src/third_party' + '@' + '0e4b552d799aa8978d441662e51bbd8da69a4f71',
+    Var('chromium_git') + '/chromium/src/third_party' + '@' + '24ccdf9b7553446791983bf357261c5e0a4314a0',
 
   'src/buildtools/linux64': {
     'packages': [
@@ -61,13 +55,13 @@ deps = {
     Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'd999d54f4bca789543a2eb6c995af2d9b5a1f3ed',
   'src/third_party/catapult':
-    Var('chromium_git') + '/catapult.git' + '@' + '7030291356e4ba8dd8a59c7a632764e5b8238a14',
+    Var('chromium_git') + '/catapult.git' + '@' + 'ccc9dd2835f5a7c5c82ae3c1a2fbc2fe2fd9dfd1',
   'src/third_party/colorama/src':
     Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8',
   'src/third_party/depot_tools':
-    Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + 'c7be37e121739c84b8dad04a23434e02177b9b2d',
+    Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + '91bb7506bd20ed22b8787e7a8b9975cc07e97175',
   'src/third_party/freetype/src':
-    Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'e9a7015ec876bc79a345af03bb756a0980345498',
+    Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '26e2a89598d69c7aba76c83f6a1fcf1db17574ab',
   'src/third_party/googletest/src':
     Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '4fe018038f87675c083d0cfb6a6b57c274fb1753',
   'src/third_party/harfbuzz-ng/src':
@@ -79,7 +73,7 @@ deps = {
   'src/third_party/yasm/source/patched-yasm':
     Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '720b70524a4424b15fc57e82263568c8ba0496ad',
   'src/tools':
-    Var('chromium_git') + '/chromium/src/tools' + '@' + '72035b43be1620ac058061bfb6f58d3a51d3bcc3',
+    Var('chromium_git') + '/chromium/src/tools' + '@' + '1bb7c085e67a0fc8c63511af83299d1632f5a3f3',
   'src/tools/swarming_client':
     Var('chromium_git') + '/infra/luci/client-py.git' + '@' + 'd46ea7635f2911208268170512cb611412488fd8',
@@ -112,9 +106,9 @@ deps = {
     'condition': 'checkout_android',
   },
   'src/third_party/boringssl/src':
-    'https://boringssl.googlesource.com/boringssl.git' + '@' + 'a673d02458b1b7d897084266b93d5c610e36bd17',
+    'https://boringssl.googlesource.com/boringssl.git' + '@' + '1607f54fed72c6589d560254626909a64124f091',
   'src/base': {
-    'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'e5c8a2271eca4665e90a3c209db9fbf478c9ecfb',
+    'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'e096814b0448fba1095c6e7be7c7a0b5d7264251',
     'condition': 'checkout_android',
   },
   'src/third_party/bazel': {
@@ -263,7 +257,7 @@ deps = {
   },
   'src/third_party/icu': {
-    'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'c7c91f829d1d5421be329536811d9336af09b27d',
+    'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'c2a4cae149aae7fd30c4cbe3cf1b30df03b386f1',
   },
   'src/third_party/icu4j': {
     'packages': [
@@ -370,7 +364,7 @@ deps = {
     'dep_type': 'cipd',
   },
   'src/third_party/robolectric/robolectric': {
-    'url': Var('chromium_git') + '/external/robolectric.git' + '@' + 'dc8c5f555f0f542dffc71b5fb80f52fe4ea946e2',
+    'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '2f3e0a3ac450a17dbf2e7d4eaab3a1f14dda50e6',
     'condition': 'checkout_android',
   },
   'src/third_party/sqlite4java': {
@@ -414,7 +408,7 @@ deps = {
 
   # iOS deps:
   'src/ios': {
-    'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '5900cb114e36aeff8db416b8436aa0e91632afa5',
+    'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '60ef55beac67e3c0eda1c35ab7944c786b377313',
     'condition': 'checkout_ios'
   },
diff --git a/chromium/third_party/libyuv/README.chromium b/chromium/third_party/libyuv/README.chromium
index 4ff4ea445d1..578228da455 100644
--- a/chromium/third_party/libyuv/README.chromium
+++ b/chromium/third_party/libyuv/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1767
+Version: 1770
 License: BSD
 License File: LICENSE
diff --git a/chromium/third_party/libyuv/include/libyuv.h b/chromium/third_party/libyuv/include/libyuv.h
index aeffd5ef7a4..a06e1233abb 100644
--- a/chromium/third_party/libyuv/include/libyuv.h
+++ b/chromium/third_party/libyuv/include/libyuv.h
@@ -26,6 +26,7 @@
 #include "libyuv/scale.h"
 #include "libyuv/scale_argb.h"
 #include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h"
 #include "libyuv/version.h"
 #include "libyuv/video_common.h"
diff --git a/chromium/third_party/libyuv/include/libyuv/planar_functions.h b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
index 9e0038f4745..8d868b95425 100644
--- a/chromium/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
@@ -200,6 +200,16 @@ int I444Copy(const uint8_t* src_y,
              int width,
              int height);
 
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+             int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+             uint8_t* dst_uv, int dst_stride_uv, int width, int height);
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+             int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+             uint8_t* dst_vu, int dst_stride_vu, int width, int height);
+
 // Convert YUY2 to I422.
 LIBYUV_API
 int YUY2ToI422(const uint8_t* src_yuy2,
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libyuv/include/libyuv/scale_row.h
index 95ecef89266..a386d499895 100644
--- a/chromium/third_party/libyuv/include/libyuv/scale_row.h
+++ b/chromium/third_party/libyuv/include/libyuv/scale_row.h
@@ -113,6 +113,7 @@ extern "C" {
 #define HAS_SCALEROWDOWN38_NEON
 #define HAS_SCALEROWDOWN4_NEON
 #define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
 #endif
 
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/chromium/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libyuv/include/libyuv/version.h
index 1d085960e39..3c632b3ab06 100644
--- a/chromium/third_party/libyuv/include/libyuv/version.h
+++ b/chromium/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1767
+#define LIBYUV_VERSION 1770
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/chromium/third_party/libyuv/source/compare_gcc.cc b/chromium/third_party/libyuv/source/compare_gcc.cc
index 676527c1b1b..6700f9697e0 100644
--- a/chromium/third_party/libyuv/source/compare_gcc.cc
+++ b/chromium/third_party/libyuv/source/compare_gcc.cc
@@ -29,38 +29,38 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
   uint64_t diff = 0u;
 
   asm volatile(
-      "xor %3,%3 \n"
-      "xor %%r8,%%r8 \n"
-      "xor %%r9,%%r9 \n"
-      "xor %%r10,%%r10 \n"
+      "xor %3,%3 \n"
+      "xor %%r8,%%r8 \n"
+      "xor %%r9,%%r9 \n"
+      "xor %%r10,%%r10 \n"
 
       // Process 32 bytes per loop.
       LABELALIGN
       "1: \n"
-      "mov (%0),%%rcx \n"
-      "mov 0x8(%0),%%rdx \n"
-      "xor (%1),%%rcx \n"
-      "xor 0x8(%1),%%rdx \n"
-      "popcnt %%rcx,%%rcx \n"
-      "popcnt %%rdx,%%rdx \n"
-      "mov 0x10(%0),%%rsi \n"
-      "mov 0x18(%0),%%rdi \n"
-      "xor 0x10(%1),%%rsi \n"
-      "xor 0x18(%1),%%rdi \n"
-      "popcnt %%rsi,%%rsi \n"
-      "popcnt %%rdi,%%rdi \n"
-      "add $0x20,%0 \n"
-      "add $0x20,%1 \n"
-      "add %%rcx,%3 \n"
-      "add %%rdx,%%r8 \n"
-      "add %%rsi,%%r9 \n"
-      "add %%rdi,%%r10 \n"
-      "sub $0x20,%2 \n"
-      "jg 1b \n"
+      "mov (%0),%%rcx \n"
+      "mov 0x8(%0),%%rdx \n"
+      "xor (%1),%%rcx \n"
+      "xor 0x8(%1),%%rdx \n"
+      "popcnt %%rcx,%%rcx \n"
+      "popcnt %%rdx,%%rdx \n"
+      "mov 0x10(%0),%%rsi \n"
+      "mov 0x18(%0),%%rdi \n"
+      "xor 0x10(%1),%%rsi \n"
+      "xor 0x18(%1),%%rdi \n"
+      "popcnt %%rsi,%%rsi \n"
+      "popcnt %%rdi,%%rdi \n"
+      "add $0x20,%0 \n"
+      "add $0x20,%1 \n"
+      "add %%rcx,%3 \n"
+      "add %%rdx,%%r8 \n"
+      "add %%rsi,%%r9 \n"
+      "add %%rdi,%%r10 \n"
+      "sub $0x20,%2 \n"
+      "jg 1b \n"
 
-      "add %%r8, %3 \n"
-      "add %%r9, %3 \n"
-      "add %%r10, %3 \n"
+      "add %%r8, %3 \n"
+      "add %%r9, %3 \n"
+      "add %%r10, %3 \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
@@ -80,26 +80,26 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
 
   // Process 16 bytes per loop.
LABELALIGN "1: \n" - "mov (%0),%%ecx \n" - "mov 0x4(%0),%%edx \n" - "xor (%1),%%ecx \n" - "xor 0x4(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "mov 0x8(%0),%%ecx \n" - "mov 0xc(%0),%%edx \n" - "xor 0x8(%1),%%ecx \n" - "xor 0xc(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "add $0x10,%0 \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 @@ -121,46 +121,46 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, uint32_t diff = 0u; asm volatile( - "movdqa %4,%%xmm2 \n" - "movdqa %5,%%xmm3 \n" - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub %0,%1 \n" + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqa (%0),%%xmm4 \n" - "movdqa 0x10(%0), %%xmm5 \n" - "pxor (%0,%1), %%xmm4 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pand %%xmm2,%%xmm6 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm6,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "paddb %%xmm7,%%xmm6 \n" - "pxor 0x10(%0,%1),%%xmm5 \n" - "add $0x20,%0 \n" - "movdqa %%xmm5,%%xmm4 \n" - "pand %%xmm2,%%xmm5 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm5,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufb %%xmm4,%%xmm5 \n" - "paddb %%xmm7,%%xmm5 \n" - "paddb %%xmm5,%%xmm6 \n" - "psadbw %%xmm1,%%xmm6 \n" - "paddd %%xmm6,%%xmm0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "movdqa (%0),%%xmm4 \n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" - "pshufd $0xaa,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0, %3 \n" + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 @@ -182,40 +182,40 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, asm volatile( "vbroadcastf128 %4,%%ymm2 \n" "vbroadcastf128 %5,%%ymm3 \n" - "vpxor %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm1,%%ymm1,%%ymm1 \n" - "sub %0,%1 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "vmovdqa (%0),%%ymm4 \n" - "vmovdqa 0x20(%0), %%ymm5 \n" - "vpxor (%0,%1), %%ymm4, %%ymm4 \n" - "vpand %%ymm2,%%ymm4,%%ymm6 \n" - "vpsrlw $0x4,%%ymm4,%%ymm4 \n" - 
"vpshufb %%ymm6,%%ymm3,%%ymm6 \n" - "vpand %%ymm2,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" - "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" - "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" - "add $0x40,%0 \n" - "vpand %%ymm2,%%ymm4,%%ymm5 \n" - "vpsrlw $0x4,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" - "vpand %%ymm2,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" - "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" - "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" - "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" - "sub $0x40,%2 \n" - "jg 1b \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" - "vpermq $0xb1,%%ymm0,%%ymm1 \n" - "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xaa,%%ymm0,%%ymm1 \n" - "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vmovd %%xmm0, %3 \n" + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" "vzeroupper \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 @@ -234,34 +234,34 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, int count) { uint32_t sse; asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 @@ -301,44 +301,44 @@ static const uvec32 kHashMul3 = { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; asm volatile( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw 
%%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" : "+r"(src), // %0 "+r"(count), // %1 "+rm"(seed), // %2 diff --git a/chromium/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libyuv/source/compare_neon.cc index 2a2181e0cb3..afdd6012164 100644 --- a/chromium/third_party/libyuv/source/compare_neon.cc +++ b/chromium/third_party/libyuv/source/compare_neon.cc @@ -29,24 +29,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, uint32_t diff; asm volatile( - "vmov.u16 q4, #0 \n" // accumulator + "vmov.u16 q4, #0 \n" // accumulator "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" - "vld1.8 {q2, q3}, [%1]! \n" - "veor.32 q0, q0, q2 \n" - "veor.32 q1, q1, q3 \n" - "vcnt.i8 q0, q0 \n" - "vcnt.i8 q1, q1 \n" - "subs %2, %2, #32 \n" - "vadd.u8 q0, q0, q1 \n" // 16 byte counts - "vpadal.u8 q4, q0 \n" // 8 shorts - "bgt 1b \n" + "vld1.8 {q0, q1}, [%0]! \n" + "vld1.8 {q2, q3}, [%1]! \n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" - "vpaddl.u16 q0, q4 \n" // 4 ints - "vpadd.u32 d0, d0, d1 \n" - "vpadd.u32 d0, d0, d0 \n" - "vmov.32 %3, d0[0] \n" + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) : @@ -59,29 +59,29 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, int count) { uint32_t sse; asm volatile( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q1}, [%1]! \n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! 
\n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) : : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); diff --git a/chromium/third_party/libyuv/source/compare_neon64.cc b/chromium/third_party/libyuv/source/compare_neon64.cc index a22ba75b330..70fb9b9143f 100644 --- a/chromium/third_party/libyuv/source/compare_neon64.cc +++ b/chromium/third_party/libyuv/source/compare_neon64.cc @@ -27,24 +27,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, int count) { uint32_t diff; asm volatile( - "movi v4.8h, #0 \n" + "movi v4.8h, #0 \n" "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" - "eor v0.16b, v0.16b, v2.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "eor v1.16b, v1.16b, v3.16b \n" - "cnt v0.16b, v0.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "cnt v1.16b, v1.16b \n" - "subs %w2, %w2, #32 \n" - "add v0.16b, v0.16b, v1.16b \n" - "uadalp v4.8h, v0.16b \n" - "b.gt 1b \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" - "uaddlv s4, v4.8h \n" - "fmov %w3, s4 \n" + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) : : "cc", "v0", "v1", "v2", "v3", "v4"); @@ -56,30 +56,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, int count) { uint32_t sse; asm volatile( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "prfm pldl1keep, [%1, 448] \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, 
v19.4s \n"
+      "fmov %w3, s0 \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
      : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
diff --git a/chromium/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libyuv/source/cpu_id.cc
index 936d7d34088..fe89452b772 100644
--- a/chromium/third_party/libyuv/source/cpu_id.cc
+++ b/chromium/third_party/libyuv/source/cpu_id.cc
@@ -75,9 +75,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) {
   asm volatile(
 #if defined(__i386__) && defined(__PIC__)
       // Preserve ebx for fpic 32 bit.
-      "mov %%ebx, %%edi \n"
+      "mov %%ebx, %%edi \n"
       "cpuid \n"
-      "xchg %%edi, %%ebx \n"
+      "xchg %%edi, %%ebx \n"
      : "=D"(info_ebx),
 #else
       "cpuid \n"
diff --git a/chromium/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libyuv/source/planar_functions.cc
index d5cd7e6808e..4e8908c2eba 100644
--- a/chromium/third_party/libyuv/source/planar_functions.cc
+++ b/chromium/third_party/libyuv/source/planar_functions.cc
@@ -349,6 +349,39 @@ int I420ToI400(const uint8_t* src_y,
   return 0;
 }
 
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+             int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+             uint8_t* dst_uv, int dst_stride_uv, int width, int height) {
+  if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
+            halfheight);
+  return 0;
+}
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+             int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+             uint8_t* dst_vu, int dst_stride_vu, int width, int height) {
+  return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+                  dst_stride_y, dst_vu, dst_stride_vu, width, height);
+}
+
 // Support function for NV12 etc UV channels.
 // Width and height are plane sizes (typically half pixel width).
 LIBYUV_API
diff --git a/chromium/third_party/libyuv/source/rotate_gcc.cc b/chromium/third_party/libyuv/source/rotate_gcc.cc
index 04e19e29eef..fd359d4ae69 100644
--- a/chromium/third_party/libyuv/source/rotate_gcc.cc
+++ b/chromium/third_party/libyuv/source/rotate_gcc.cc
@@ -31,75 +31,75 @@ void TransposeWx8_SSSE3(const uint8_t* src,
   // Read in the data from the source pointer.
   // First round of bit swap.
LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -121,127 +121,127 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 
\n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" // Third round of bit swap. // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -266,95 +266,95 @@ void TransposeUVWx8_SSE2(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. 
LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. 
- "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst_a), // %1 "+r"(dst_b), // %2 diff --git a/chromium/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libyuv/source/rotate_neon.cc index fdc0dd476c6..844df2bf305 100644 --- a/chromium/third_party/libyuv/source/rotate_neon.cc +++ b/chromium/third_party/libyuv/source/rotate_neon.cc @@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src, // handle 8x8 blocks. 
this should be the majority of the plane "1: \n" - "mov %0, %1 \n" - - "vld1.8 {d0}, [%0], %2 \n" - "vld1.8 {d1}, [%0], %2 \n" - "vld1.8 {d2}, [%0], %2 \n" - "vld1.8 {d3}, [%0], %2 \n" - "vld1.8 {d4}, [%0], %2 \n" - "vld1.8 {d5}, [%0], %2 \n" - "vld1.8 {d6}, [%0], %2 \n" - "vld1.8 {d7}, [%0] \n" - - "vtrn.8 d1, d0 \n" - "vtrn.8 d3, d2 \n" - "vtrn.8 d5, d4 \n" - "vtrn.8 d7, d6 \n" - - "vtrn.16 d1, d3 \n" - "vtrn.16 d0, d2 \n" - "vtrn.16 d5, d7 \n" - "vtrn.16 d4, d6 \n" - - "vtrn.32 d1, d5 \n" - "vtrn.32 d0, d4 \n" - "vtrn.32 d3, d7 \n" - "vtrn.32 d2, d6 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - - "mov %0, %3 \n" - - "vst1.8 {d1}, [%0], %4 \n" - "vst1.8 {d0}, [%0], %4 \n" - "vst1.8 {d3}, [%0], %4 \n" - "vst1.8 {d2}, [%0], %4 \n" - "vst1.8 {d5}, [%0], %4 \n" - "vst1.8 {d4}, [%0], %4 \n" - "vst1.8 {d7}, [%0], %4 \n" - "vst1.8 {d6}, [%0] \n" - - "add %1, #8 \n" // src += 8 - "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride - "subs %5, #8 \n" // w -= 8 - "bge 1b \n" + "mov %0, %1 \n" + + "vld1.8 {d0}, [%0], %2 \n" + "vld1.8 {d1}, [%0], %2 \n" + "vld1.8 {d2}, [%0], %2 \n" + "vld1.8 {d3}, [%0], %2 \n" + "vld1.8 {d4}, [%0], %2 \n" + "vld1.8 {d5}, [%0], %2 \n" + "vld1.8 {d6}, [%0], %2 \n" + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + "vst1.8 {d1}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d3}, [%0], %4 \n" + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d5}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d7}, [%0], %4 \n" + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. @@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src, // handle 8x8 blocks. 
this should be the majority of the plane "1: \n" - "mov %0, %1 \n" - - "vld2.8 {d0, d1}, [%0], %2 \n" - "vld2.8 {d2, d3}, [%0], %2 \n" - "vld2.8 {d4, d5}, [%0], %2 \n" - "vld2.8 {d6, d7}, [%0], %2 \n" - "vld2.8 {d16, d17}, [%0], %2 \n" - "vld2.8 {d18, d19}, [%0], %2 \n" - "vld2.8 {d20, d21}, [%0], %2 \n" - "vld2.8 {d22, d23}, [%0] \n" - - "vtrn.8 q1, q0 \n" - "vtrn.8 q3, q2 \n" - "vtrn.8 q9, q8 \n" - "vtrn.8 q11, q10 \n" - - "vtrn.16 q1, q3 \n" - "vtrn.16 q0, q2 \n" - "vtrn.16 q9, q11 \n" - "vtrn.16 q8, q10 \n" - - "vtrn.32 q1, q9 \n" - "vtrn.32 q0, q8 \n" - "vtrn.32 q3, q11 \n" - "vtrn.32 q2, q10 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - "vrev16.8 q8, q8 \n" - "vrev16.8 q9, q9 \n" - "vrev16.8 q10, q10 \n" - "vrev16.8 q11, q11 \n" - - "mov %0, %3 \n" - - "vst1.8 {d2}, [%0], %4 \n" - "vst1.8 {d0}, [%0], %4 \n" - "vst1.8 {d6}, [%0], %4 \n" - "vst1.8 {d4}, [%0], %4 \n" - "vst1.8 {d18}, [%0], %4 \n" - "vst1.8 {d16}, [%0], %4 \n" - "vst1.8 {d22}, [%0], %4 \n" - "vst1.8 {d20}, [%0] \n" - - "mov %0, %5 \n" - - "vst1.8 {d3}, [%0], %6 \n" - "vst1.8 {d1}, [%0], %6 \n" - "vst1.8 {d7}, [%0], %6 \n" - "vst1.8 {d5}, [%0], %6 \n" - "vst1.8 {d19}, [%0], %6 \n" - "vst1.8 {d17}, [%0], %6 \n" - "vst1.8 {d23}, [%0], %6 \n" - "vst1.8 {d21}, [%0] \n" - - "add %1, #8*2 \n" // src += 8*2 - "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a - "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b - "subs %7, #8 \n" // w -= 8 - "bge 1b \n" + "mov %0, %1 \n" + + "vld2.8 {d0, d1}, [%0], %2 \n" + "vld2.8 {d2, d3}, [%0], %2 \n" + "vld2.8 {d4, d5}, [%0], %2 \n" + "vld2.8 {d6, d7}, [%0], %2 \n" + "vld2.8 {d16, d17}, [%0], %2 \n" + "vld2.8 {d18, d19}, [%0], %2 \n" + "vld2.8 {d20, d21}, [%0], %2 \n" + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d6}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d18}, [%0], %4 \n" + "vst1.8 {d16}, [%0], %4 \n" + "vst1.8 {d22}, [%0], %4 \n" + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.8 {d3}, [%0], %6 \n" + "vst1.8 {d1}, [%0], %6 \n" + "vst1.8 {d7}, [%0], %6 \n" + "vst1.8 {d5}, [%0], %6 \n" + "vst1.8 {d19}, [%0], %6 \n" + "vst1.8 {d17}, [%0], %6 \n" + "vst1.8 {d23}, [%0], %6 \n" + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. diff --git a/chromium/third_party/libyuv/source/rotate_neon64.cc b/chromium/third_party/libyuv/source/rotate_neon64.cc index 99f7ee16ca0..43c1581731d 100644 --- a/chromium/third_party/libyuv/source/rotate_neon64.cc +++ b/chromium/third_party/libyuv/source/rotate_neon64.cc @@ -34,74 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src, // loops are on blocks of 8. loop will stop when // counter gets to or below 0. 
starting the counter // at w-8 allow for this - "sub %w3, %w3, #8 \n" + "sub %w3, %w3, #8 \n" // handle 8x8 blocks. this should be the majority of the plane "1: \n" "mov %0, %1 \n" - "ld1 {v0.8b}, [%0], %5 \n" - "ld1 {v1.8b}, [%0], %5 \n" - "ld1 {v2.8b}, [%0], %5 \n" - "ld1 {v3.8b}, [%0], %5 \n" - "ld1 {v4.8b}, [%0], %5 \n" - "ld1 {v5.8b}, [%0], %5 \n" - "ld1 {v6.8b}, [%0], %5 \n" - "ld1 {v7.8b}, [%0] \n" + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" "mov %0, %1 \n" - "trn2 v16.8b, v0.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "trn1 v17.8b, v0.8b, v1.8b \n" - "add %0, %0, %5 \n" - "trn2 v18.8b, v2.8b, v3.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 1 - "trn1 v19.8b, v2.8b, v3.8b \n" - "add %0, %0, %5 \n" - "trn2 v20.8b, v4.8b, v5.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 2 - "trn1 v21.8b, v4.8b, v5.8b \n" - "add %0, %0, %5 \n" - "trn2 v22.8b, v6.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 3 - "trn1 v23.8b, v6.8b, v7.8b \n" - "add %0, %0, %5 \n" - - "trn2 v3.4h, v17.4h, v19.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 4 - "trn1 v1.4h, v17.4h, v19.4h \n" - "add %0, %0, %5 \n" - "trn2 v2.4h, v16.4h, v18.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 5 - "trn1 v0.4h, v16.4h, v18.4h \n" - "add %0, %0, %5 \n" - "trn2 v7.4h, v21.4h, v23.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 6 - "trn1 v5.4h, v21.4h, v23.4h \n" - "add %0, %0, %5 \n" - "trn2 v6.4h, v20.4h, v22.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 7 - "trn1 v4.4h, v20.4h, v22.4h \n" - - "trn2 v21.2s, v1.2s, v5.2s \n" - "trn1 v17.2s, v1.2s, v5.2s \n" - "trn2 v20.2s, v0.2s, v4.2s \n" - "trn1 v16.2s, v0.2s, v4.2s \n" - "trn2 v23.2s, v3.2s, v7.2s \n" - "trn1 v19.2s, v3.2s, v7.2s \n" - "trn2 v22.2s, v2.2s, v6.2s \n" - "trn1 v18.2s, v2.2s, v6.2s \n" + "trn2 v16.8b, v0.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "trn1 v17.8b, v0.8b, v1.8b \n" + "add %0, %0, %5 \n" + "trn2 v18.8b, v2.8b, v3.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 1 + "trn1 v19.8b, v2.8b, v3.8b \n" + "add %0, %0, %5 \n" + "trn2 v20.8b, v4.8b, v5.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 2 + "trn1 v21.8b, v4.8b, v5.8b \n" + "add %0, %0, %5 \n" + "trn2 v22.8b, v6.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 3 + "trn1 v23.8b, v6.8b, v7.8b \n" + "add %0, %0, %5 \n" + + "trn2 v3.4h, v17.4h, v19.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 4 + "trn1 v1.4h, v17.4h, v19.4h \n" + "add %0, %0, %5 \n" + "trn2 v2.4h, v16.4h, v18.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 5 + "trn1 v0.4h, v16.4h, v18.4h \n" + "add %0, %0, %5 \n" + "trn2 v7.4h, v21.4h, v23.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 6 + "trn1 v5.4h, v21.4h, v23.4h \n" + "add %0, %0, %5 \n" + "trn2 v6.4h, v20.4h, v22.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 7 + "trn1 v4.4h, v20.4h, v22.4h \n" + + "trn2 v21.2s, v1.2s, v5.2s \n" + "trn1 v17.2s, v1.2s, v5.2s \n" + "trn2 v20.2s, v0.2s, v4.2s \n" + "trn1 v16.2s, v0.2s, v4.2s \n" + "trn2 v23.2s, v3.2s, v7.2s \n" + "trn1 v19.2s, v3.2s, v7.2s \n" + "trn2 v22.2s, v2.2s, v6.2s \n" + "trn1 v18.2s, v2.2s, v6.2s \n" "mov %0, %2 \n" - "st1 {v17.8b}, [%0], %6 \n" - "st1 {v16.8b}, [%0], %6 \n" - "st1 {v19.8b}, [%0], %6 \n" - "st1 {v18.8b}, [%0], %6 \n" - "st1 {v21.8b}, [%0], %6 \n" - "st1 {v20.8b}, [%0], %6 \n" - "st1 {v23.8b}, [%0], %6 \n" - "st1 {v22.8b}, [%0] \n" + "st1 {v17.8b}, [%0], %6 
\n" + "st1 {v16.8b}, [%0], %6 \n" + "st1 {v19.8b}, [%0], %6 \n" + "st1 {v18.8b}, [%0], %6 \n" + "st1 {v21.8b}, [%0], %6 \n" + "st1 {v20.8b}, [%0], %6 \n" + "st1 {v23.8b}, [%0], %6 \n" + "st1 {v22.8b}, [%0] \n" "add %1, %1, #8 \n" // src += 8 "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride @@ -110,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src, // add 8 back to counter. if the result is 0 there are // no residuals. - "adds %w3, %w3, #8 \n" - "b.eq 4f \n" + "adds %w3, %w3, #8 \n" + "b.eq 4f \n" // some residual, so between 1 and 7 lines left to transpose - "cmp %w3, #2 \n" - "b.lt 3f \n" + "cmp %w3, #2 \n" + "b.lt 3f \n" - "cmp %w3, #4 \n" - "b.lt 2f \n" + "cmp %w3, #4 \n" + "b.lt 2f \n" // 4x8 block - "mov %0, %1 \n" - "ld1 {v0.s}[0], [%0], %5 \n" - "ld1 {v0.s}[1], [%0], %5 \n" - "ld1 {v0.s}[2], [%0], %5 \n" - "ld1 {v0.s}[3], [%0], %5 \n" - "ld1 {v1.s}[0], [%0], %5 \n" - "ld1 {v1.s}[1], [%0], %5 \n" - "ld1 {v1.s}[2], [%0], %5 \n" - "ld1 {v1.s}[3], [%0] \n" + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - "ld1 {v2.16b}, [%4] \n" + "ld1 {v2.16b}, [%4] \n" - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" // TODO(frkoenig): Rework shuffle above to // write out with 4 instead of 8 writes. @@ -228,90 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src, // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this - "sub %w4, %w4, #8 \n" + "sub %w4, %w4, #8 \n" // handle 8x8 blocks. 
this should be the majority of the plane "1: \n" - "mov %0, %1 \n" - - "ld1 {v0.16b}, [%0], %5 \n" - "ld1 {v1.16b}, [%0], %5 \n" - "ld1 {v2.16b}, [%0], %5 \n" - "ld1 {v3.16b}, [%0], %5 \n" - "ld1 {v4.16b}, [%0], %5 \n" - "ld1 {v5.16b}, [%0], %5 \n" - "ld1 {v6.16b}, [%0], %5 \n" - "ld1 {v7.16b}, [%0] \n" - "mov %0, %1 \n" - - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" - - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" - - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" + "mov %0, %1 \n" - "mov %0, %2 \n" + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" + "mov %0, %1 \n" - "st1 {v16.d}[0], [%0], %6 \n" - "st1 {v18.d}[0], [%0], %6 \n" - "st1 {v17.d}[0], [%0], %6 \n" - "st1 {v19.d}[0], [%0], %6 \n" - "st1 {v16.d}[1], [%0], %6 \n" - "st1 {v18.d}[1], [%0], %6 \n" - "st1 {v17.d}[1], [%0], %6 \n" - "st1 {v19.d}[1], [%0] \n" + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" - "mov %0, %3 \n" + "mov %0, %2 \n" - "st1 {v20.d}[0], [%0], %7 \n" - "st1 {v22.d}[0], [%0], %7 \n" - "st1 {v21.d}[0], [%0], %7 \n" - "st1 {v23.d}[0], [%0], %7 \n" - "st1 {v20.d}[1], [%0], %7 \n" - "st1 {v22.d}[1], [%0], %7 \n" - "st1 {v21.d}[1], [%0], %7 \n" - "st1 {v23.d}[1], [%0] \n" - - "add %1, %1, #16 \n" // src += 8*2 - "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 {v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * // dst_stride_a - "add %3, %3, %7, lsl #3 \n" 
// dst_b += 8 * + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * // dst_stride_b - "subs %w4, %w4, #8 \n" // w -= 8 - "b.ge 1b \n" + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. - "adds %w4, %w4, #8 \n" - "b.eq 4f \n" + "adds %w4, %w4, #8 \n" + "b.eq 4f \n" // some residual, so between 1 and 7 lines left to transpose - "cmp %w4, #2 \n" - "b.lt 3f \n" + "cmp %w4, #2 \n" + "b.lt 3f \n" - "cmp %w4, #4 \n" - "b.lt 2f \n" + "cmp %w4, #4 \n" + "b.lt 2f \n" // TODO(frkoenig): Clean this up // 4x8 block diff --git a/chromium/third_party/libyuv/source/row_gcc.cc b/chromium/third_party/libyuv/source/row_gcc.cc index c4c012ff412..a107c30e769 100644 --- a/chromium/third_party/libyuv/source/row_gcc.cc +++ b/chromium/third_party/libyuv/source/row_gcc.cc @@ -159,24 +159,24 @@ static const lvec8 kShuffleNV21 = { #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -190,35 +190,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -228,35 +228,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* 
src_rgb24, void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -267,35 +267,35 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff - "psrld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff + "psrld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgba), // %1 "+r"(width) // %2 @@ -307,25 +307,25 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( - "movdqa %3,%%xmm3 \n" - "movdqa 
%4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x4(%0),%%xmm1 \n" - "movdqu 0x8(%0),%%xmm2 \n" - "lea 0x18(%0),%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x4(%0),%%xmm1 \n" + "movdqu 0x8(%0),%%xmm2 \n" + "lea 0x18(%0),%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -337,44 +337,44 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xa,%%xmm4 \n" - "psrlw $0x5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -385,47 +385,47 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd 
$0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -436,34 +436,34 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,0x00(%1,%0,2) \n" - "movdqu %%xmm1,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,0x00(%1,%0,2) \n" + "movdqu %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -474,35 +474,35 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm6 \n" + "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" 
- "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -513,35 +513,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm6 \n" + "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -556,37 +556,37 @@ static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm6 \n" - "vmovdqa %4,%%ymm7 \n" + "vmovdqa %4,%%ymm7 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 - "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes - "vpermd %%ymm1,%%ymm7,%%ymm1 \n" - "vpermd 
%%ymm2,%%ymm7,%%ymm2 \n" - "vpermd %%ymm3,%%ymm7,%%ymm3 \n" - "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 - "vpermq $0x4f,%%ymm2,%%ymm4 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 - "vpermq $0x93,%%ymm3,%%ymm3 \n" - "vpor %%ymm3,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -615,26 +615,26 @@ static const ulvec8 kPermARGBToRGB24_2 = { void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vmovdqa %3,%%ymm5 \n" - "vmovdqa %4,%%ymm6 \n" - "vmovdqa %5,%%ymm7 \n" + "vmovdqa %3,%%ymm5 \n" + "vmovdqa %4,%%ymm6 \n" + "vmovdqa %5,%%ymm7 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" - "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" - "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" + "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -650,37 +650,37 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm6 \n" - "vmovdqa %4,%%ymm7 \n" + "vmovdqa %4,%%ymm7 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 - "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes - "vpermd %%ymm1,%%ymm7,%%ymm1 \n" - "vpermd %%ymm2,%%ymm7,%%ymm2 \n" - "vpermd %%ymm3,%%ymm7,%%ymm3 \n" - "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vpermq 
$0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 - "vpermq $0x4f,%%ymm2,%%ymm4 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 - "vpermq $0x93,%%ymm3,%%ymm3 \n" - "vpor %%ymm3,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -694,34 +694,34 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -734,40 +734,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, const uint32_t dither4, int width) { asm volatile( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + 
"pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -783,35 +783,35 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, int width) { asm volatile( "vbroadcastss %3,%%xmm6 \n" - "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" - "vpermq $0xd8,%%ymm6,%%ymm6 \n" - "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" - "vpsrld $0x1b,%%ymm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $0x1a,%%ymm4,%%ymm4 \n" - "vpslld $0x5,%%ymm4,%%ymm4 \n" - "vpslld $0xb,%%ymm3,%%ymm5 \n" + "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" + "vpermq $0xd8,%%ymm6,%%ymm6 \n" + "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrld $0x1b,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $0x1a,%%ymm4,%%ymm4 \n" + "vpslld $0x5,%%ymm4,%%ymm4 \n" + "vpslld $0xb,%%ymm3,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" - "vpsrld $0x5,%%ymm0,%%ymm2 \n" - "vpsrld $0x3,%%ymm0,%%ymm1 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - "vpand %%ymm4,%%ymm2,%%ymm2 \n" - "vpand %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpor %%ymm2,%%ymm1,%%ymm1 \n" - "vpor %%ymm1,%%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" + "vpsrld $0x5,%%ymm0,%%ymm2 \n" + "vpsrld $0x3,%%ymm0,%%ymm1 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpand %%ymm4,%%ymm2,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -824,38 +824,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" LABELALIGN 
"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -865,26 +865,26 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -928,31 +928,31 @@ static const uint32_t kMulAG10 = 64 * 65536 + 1028; void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 
\n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -967,31 +967,31 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -1008,25 +1008,25 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB - "vbroadcastss %4,%%ymm3 \n" // multipler for RB - "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 - "vbroadcastss %6,%%ymm5 \n" // mask for AG - "vbroadcastss %7,%%ymm6 \n" // multipler for AG - "sub %0,%1 \n" - - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels - "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 - "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 - "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 - "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 - "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 - "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels - "add $0x20,%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" 
// multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -1045,25 +1045,25 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB - "vbroadcastss %4,%%ymm3 \n" // multipler for RB - "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 - "vbroadcastss %6,%%ymm5 \n" // mask for AG - "vbroadcastss %7,%%ymm6 \n" // multipler for AG - "sub %0,%1 \n" - - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels - "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 - "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 - "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 - "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 - "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 - "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels - "add $0x20,%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -1150,9 +1150,9 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 Y values. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_argb), // %0 @@ -1171,8 +1171,8 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Same as ARGBToYRow but different coefficients, no add 16. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) : "+r"(src_argb), // %0 @@ -1189,8 +1189,8 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Same as ARGBToYRow but different coefficients, no add 16. 
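Aside on the YJ paths, prompted by the comment above: "different coefficients, no add 16" means these kernels produce full-range BT.601 luma rather than the video-range Y of ARGBToYRow. A minimal scalar sketch of the arithmetic the SIMD rows perform, assuming the common /256 integer approximation of the 0.299/0.587/0.114 weights; the exact table values are libyuv's kARGBToYJ constants, which this hunk does not show:

  #include <stdint.h>
  // Full-range BT.601 luma, rounded; note there is no +16 bias.
  static inline uint8_t RGBToYJ_Scalar(uint8_t r, uint8_t g, uint8_t b) {
    return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
  }

The RGBAToYJRow_SSSE3 hunk below applies the same math, differing from ARGBToYJRow only in the channel order read from memory.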
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) : "+r"(src_rgba), // %0 @@ -1212,7 +1212,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(ymm7) : "+r"(src_argb), // %0 @@ -1234,7 +1234,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(ymm7) : "+r"(src_abgr), // %0 @@ -1255,7 +1255,7 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(ymm5) : "+r"(src_argb), // %0 @@ -1275,7 +1275,7 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1296,52 +1296,52 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + 
"pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1368,44 +1368,44 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1431,44 +1431,44 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb 
%%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 @@ -1494,45 +1494,45 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 
\n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1555,53 +1555,53 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1620,47 +1620,47 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 
\n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqu %%xmm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1674,9 +1674,9 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_bgra), // %0 @@ -1695,52 +1695,52 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw 
$0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_bgra0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1754,9 +1754,9 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_abgr), // %0 @@ -1771,9 +1771,9 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_rgba), // %0 @@ -1792,52 +1792,52 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + 
"pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1855,52 +1855,52 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 
0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_rgba0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -2117,16 +2117,16 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV444 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2146,27 +2146,27 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" - "sub %[u_buf],%[v_buf] \n" + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" - "subl $0x8,%[width] \n" - "jg 1b \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0,(%[dst_rgb24]) \n" + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2192,16 +2192,16 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2221,21 +2221,21 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min - "psrlw $6,%%xmm7 \n" // 1023 for max + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB16(yuvconstants) STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2256,16 +2256,16 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] 
\n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2286,21 +2286,21 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min - "psrlw $6,%%xmm7 \n" // 1023 for max + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV210 YUVTORGB16(yuvconstants) STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2323,15 +2323,15 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUVA422 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2358,15 +2358,15 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV12 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -2386,15 +2386,15 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV21 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [vu_buf]"+r"(vu_buf), // %[vu_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -2414,15 +2414,15 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -2442,15 +2442,15 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -2471,16 +2471,16 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( 
YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STORERGBA - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2695,17 +2695,17 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV444_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2729,18 +2729,18 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2764,23 +2764,23 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2804,18 +2804,18 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2839,23 +2839,23 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - 
"vpsrlw $6,%%ymm7,%%ymm7 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2881,16 +2881,16 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2920,11 +2920,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) @@ -2964,16 +2964,16 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV12_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -2997,16 +2997,16 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV21_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [vu_buf]"+r"(vu_buf), // %[vu_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -3030,16 +3030,16 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -3063,16 +3063,16 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [uyvy_buf]"+r"(uyvy_buf), // 
%[uyvy_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -3092,10 +3092,10 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile( - "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164 - "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 - "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 - "pslld $0x18,%%xmm4 \n" + "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164 + "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 + "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 + "pslld $0x18,%%xmm4 \n" LABELALIGN "1: \n" @@ -3137,10 +3137,10 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile( - "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164 - "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 - "vpslld $0x18,%%ymm4,%%ymm4 \n" + "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164 + "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 + "vpslld $0x18,%%ymm4,%%ymm4 \n" LABELALIGN "1: \n" @@ -3182,16 +3182,16 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "movdqa %3,%%xmm5 \n" + "movdqa %3,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -3209,13 +3209,13 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { LABELALIGN "1: \n" - "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -3234,16 +3234,16 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "movdqa %3,%%xmm5 \n" + "movdqa %3,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu -0x10(%0,%2,2),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu -0x10(%0,%2,2),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 "+r"(temp_width) // %2 @@ -3261,13 +3261,13 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { LABELALIGN "1: \n" - "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 @@ -3287,20 +3287,20 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "movdqa %4,%%xmm1 \n" - "lea 
-0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -3327,27 +3327,27 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" // first 5 - "movdqu 15(%0),%%xmm1 \n" // next 5 - "movdqu 30(%0),%%xmm2 \n" // next 5 - "movdqu 32(%0),%%xmm3 \n" // last 1 special - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm4,%%xmm2 \n" - "pshufb %%xmm5,%%xmm3 \n" - "lea -0x30(%0),%0 \n" - "movdqu %%xmm0,32(%1) \n" // last 5 - "movdqu %%xmm1,17(%1) \n" // next 5 - "movdqu %%xmm2,2(%1) \n" // next 5 - "movlpd %%xmm3,0(%1) \n" // first 1 - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" // first 5 + "movdqu 15(%0),%%xmm1 \n" // next 5 + "movdqu 30(%0),%%xmm2 \n" // next 5 + "movdqu 32(%0),%%xmm3 \n" // last 1 special + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm4,%%xmm2 \n" + "pshufb %%xmm5,%%xmm3 \n" + "lea -0x30(%0),%0 \n" + "movdqu %%xmm0,32(%1) \n" // last 5 + "movdqu %%xmm1,17(%1) \n" // next 5 + "movdqu %%xmm2,2(%1) \n" // next 5 + "movlpd %%xmm3,0(%1) \n" // first 1 + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_rgb24), // %1 "+r"(temp_width) // %2 @@ -3363,17 +3363,17 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "lea -0x10(%0,%2,4),%0 \n" + "lea -0x10(%0,%2,4),%0 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -3389,15 +3389,15 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "vmovdqu %3,%%ymm5 \n" + "vmovdqu %3,%%ymm5 \n" LABELALIGN "1: \n" - "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -3413,28 +3413,28 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand 
%%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm2,0x00(%1,%2,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 @@ -3451,28 +3451,28 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -3489,22 +3489,22 @@ void MergeUVRow_AVX2(const uint8_t* src_u, int width) { asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm2,(%2) \n" "vextractf128 $0x0,%%ymm0,0x10(%2) \n" "vextractf128 $0x1,%%ymm2,0x20(%2) \n" "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 @@ -3522,21 +3522,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u, int width) { asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // 
%2 @@ -3559,30 +3559,30 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, int width) { // clang-format off asm volatile ( - "vmovd %4,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" // 16 pixels per loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu (%0,%1,1),%%ymm1 \n" - "add $0x20,%0 \n" - - "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates - "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "add $0x40,%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -3605,24 +3605,24 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" // 16 pixels per loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%0,%1) \n" - "vmovdqu %%ymm1,0x20(%0,%1) \n" - "add $0x40,%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3643,23 +3643,23 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" // 32 pixels per loop. 
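  // The setup above broadcasts the 16-bit "scale" operand to every lane;
  // pmulhuw then keeps the high 16 bits of src*scale, so per element this
  // kernel computes dst = clamp8((src * scale) >> 16), with packuswb doing
  // the saturation. A rough scalar sketch of that contract (clamp8() is a
  // hypothetical helper used here only for illustration):
  //   for (int i = 0; i < width; ++i)
  //     dst_y[i] = clamp8((src_y[i] * scale) >> 16);
  // e.g. scale = 16384 maps 10-bit input (0..1023) onto 0..255.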
LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "add $0x20,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "add $0x20,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3675,25 +3675,25 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm2 \n" - "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" // 32 pixels per loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "add $0x40,%0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "add $0x20,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x20,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3714,25 +3714,25 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, int width) { // clang-format off asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" // 32 pixels per loop. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "add $0x10,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "add $0x20,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "add $0x10,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "add $0x20,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3748,26 +3748,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm2 \n" - "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" // 32 pixels per loop. 
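  // As in the SSE2 version above, the unpack with the source as both
  // operands duplicates each byte into a 16-bit lane, i.e. v16 = v8 * 257;
  // pmulhuw by "scale" then yields (v16 * scale) >> 16. A rough scalar
  // sketch of the per-element result:
  //   for (int i = 0; i < width; ++i)
  //     dst_y[i] = (uint16_t)((src_y[i] * 257 * scale) >> 16);
  // e.g. scale = 1024 maps 8-bit input (0..255) onto 10-bit 0..1023.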
LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "add $0x40,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3819,41 +3819,41 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - "lea 0x10(%3),%3 \n" - "lea 0x30(%0),%0 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -3914,42 +3914,42 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,16(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,32(%3) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "lea 0x10(%2),%2 \n" - "lea 0x30(%3),%3 \n" - "sub $0x10,%4 \n" - 
"jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -3971,35 +3971,35 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" LABELALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 9f \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" LABELALIGN "2: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 2b \n" - - LABELALIGN "9: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -4014,14 +4014,14 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -4036,7 +4036,7 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); asm volatile( - "rep movsb \n" + "rep movsb \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -4049,29 +4049,29 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { // width in pixels void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand 
%%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -4084,21 +4084,21 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { // width in pixels void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "vmovdqu 0x20(%0),%%ymm2 \n" - "lea 0x40(%0),%0 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -4117,17 +4117,17 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, LABELALIGN "1: \n" - "movdqu (%0), %%xmm0 \n" - "movdqu 0x10(%0), %%xmm1 \n" - "lea 0x20(%0), %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1), %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 "+rm"(width) // %2 @@ -4145,28 +4145,28 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width) { asm volatile( - "vmovdqa %3,%%ymm4 \n" + "vmovdqa %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0), %%ymm0 \n" - "vmovdqu 0x20(%0), %%ymm1 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu 0x40(%0), %%ymm2 \n" - "vmovdqu 0x60(%0), %%ymm3 \n" - "lea 0x80(%0), %0 \n" - "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 
- "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20, %2 \n" - "jg 1b \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 @@ -4181,31 +4181,31 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" LABELALIGN "1: \n" - "movq (%0),%%xmm2 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -4218,23 +4218,23 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { // width in pixels void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN "1: \n" - "vpmovzxbd (%0),%%ymm1 \n" - "vpmovzxbd 0x8(%0),%%ymm2 \n" - "lea 0x10(%0),%0 \n" - "vpslld $0x18,%%ymm1,%%ymm1 \n" - "vpslld $0x18,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -4250,7 +4250,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
asm volatile( - "rep stosl \n" + "rep stosl \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -4261,7 +4261,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); asm volatile( - "rep stosb \n" + "rep stosb \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v8) // %2 @@ -4272,7 +4272,7 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); asm volatile( - "rep stosl \n" + "rep stosl \n" : "+D"(dst_argb), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -4283,21 +4283,21 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -4311,32 +4311,32 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4350,28 +4350,28 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 
\n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4385,16 +4385,16 @@ void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -4408,32 +4408,32 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4447,28 +4447,28 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq 
%%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4481,22 +4481,22 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 @@ -4511,32 +4511,32 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 @@ -4551,30 +4551,30 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 
0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 @@ -4589,17 +4589,17 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -4613,32 +4613,32 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 @@ -4653,30 +4653,30 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - 
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 @@ -4698,71 +4698,71 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" // 4 pixel loop. LABELALIGN "40: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" + "add $0x3,%3 \n" + "jl 99f \n" // 1 pixel loop. 
"91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" "99: \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4786,36 +4786,36 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" + "jg 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(alpha), // %2 @@ -4838,43 +4838,43 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, uint8_t* dst, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsllw $0x8,%%ymm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm6 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" "vbroadcastss %%xmm6,%%ymm6 \n" - "mov $0x807f807f,%%eax \n" - "vmovd %%eax,%%xmm7 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" "vbroadcastss %%xmm7,%%ymm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" // 32 pixel loop. 
LABELALIGN "1: \n" - "vmovdqu (%2),%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm5,%%ymm3,%%ymm3 \n" - "vpxor %%ymm5,%%ymm0,%%ymm0 \n" - "vmovdqu (%0,%2,1),%%ymm1 \n" - "vmovdqu (%1,%2,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3,%2,1) \n" - "lea 0x20(%2),%2 \n" - "sub $0x20,%4 \n" - "jg 1b \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src0), // %0 "+r"(src1), // %1 @@ -4898,35 +4898,35 @@ void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -4947,29 +4947,29 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" // 8 pixel loop. 
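// Illustrative scalar sketch (assumption): ARGBAttenuateRow premultiplies each
// color channel by its pixel's alpha, leaving alpha itself untouched. The
// vpunpck/vpshufb/vpmulhuw sequence widens channel and alpha to 16 bits and
// keeps the high byte of the product, i.e. roughly c * a / 255:
static inline uint8_t Attenuate_C(uint8_t c, uint8_t a) {
  return (uint8_t)((c * a + 255) >> 8);  // one rounding choice; SIMD differs by <= 1
}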
LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -4989,32 +4989,32 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movzb 0x03(%0),%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x07(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width), // %2 @@ -5034,52 +5034,52 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, int width) { uintptr_t alpha; asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" // 8 pixel loop. 
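// Illustrative scalar sketch (assumption): unattenuate is the inverse of
// attenuate, c' = min(255, c * 255 / a). A per-pixel divide is slow, so the
// loop below instead gathers a fixed-point reciprocal of alpha from the table
// passed in %4 by hand (hence the "replace VPGATHER" comment) and multiplies
// with pmulhuw.
static inline uint8_t Unattenuate_C(uint8_t c, uint8_t a) {
  if (a == 0) return c;           // fully transparent: leave the channel as-is
  uint32_t v = (c * 255u) / a;    // reference math behind the table lookup
  return (uint8_t)(v > 255 ? 255 : v);
}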
LABELALIGN "1: \n" // replace VPGATHER - "movzb 0x03(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x07(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "movzb 0x13(%0),%3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x17(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x1b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x1f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "movzb 0x03(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x07(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "movzb 0x13(%0),%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x17(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x1b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x1f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" // end of VPGATHER - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" - "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -5096,42 +5096,42 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" // 8 pixel loop. 
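// Illustrative scalar sketch (assumption): ARGBGrayRow computes a JPEG-range
// BT.601 luma from B, G, R and writes it back to all three color channels,
// keeping alpha. The exact weights live in the constants referenced as %3 and
// %4 above; these integer weights, summing to 256, are one common choice:
static inline uint8_t GrayY_C(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
}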
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psubb %%xmm5,%%xmm0 \n" - "psubb %%xmm5,%%xmm1 \n" - "movdqu %%xmm4,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "movdqu %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "phaddw %%xmm0,%%xmm6 \n" - "paddw %%xmm5,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm6,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm6,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psubb %%xmm5,%%xmm0 \n" + "psubb %%xmm5,%%xmm1 \n" + "movdqu %%xmm4,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "movdqu %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "phaddw %%xmm0,%%xmm6 \n" + "paddw %%xmm5,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm6,%%xmm3 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm6,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "movdqu %%xmm6,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -5158,50 +5158,50 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { asm volatile( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" // 8 pixel loop. 
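// Illustrative scalar sketch: the sepia weights are the kARGBToSepia* rows
// declared above (e.g. {24, 98, 50} for the R output), applied per output
// channel as a dot product with a >> 7 shift and unsigned saturation:
static inline uint8_t SepiaChannel_C(uint8_t b, uint8_t g, uint8_t r,
                                     int wb, int wg, int wr) {
  int v = (wb * b + wg * g + wr * r) >> 7;  // pmaddubsw + phaddw + psrlw $0x7
  return (uint8_t)(v > 255 ? 255 : v);      // packuswb saturates
}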
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%1 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "m"(kARGBToSepiaB), // %2 @@ -5219,54 +5219,54 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, const int8_t* matrix_argb, int width) { asm volatile( - "movdqu (%3),%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" // 8 pixel loop. 
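// Illustrative scalar sketch (assumption): matrix_argb is read as four rows of
// signed 8-bit coefficients (the pshufd sequence above splits them into
// xmm2..xmm5); each output channel is a signed dot product of the input pixel
// with one row, shifted right by 6 (psraw $0x6) and clamped:
static inline uint8_t ColorMatrixChannel_C(const uint8_t* p, const int8_t* m) {
  int v = (m[0] * p[0] + m[1] * p[1] + m[2] * p[2] + m[3] * p[3]) >> 6;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // packuswb clamps both ends
}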
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm6,0x10(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -5284,40 +5284,40 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int interval_offset, int width) { asm volatile( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" // 4 pixel loop. 
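// Illustrative scalar sketch (assumption): ARGBQuantizeRow posterizes the
// three color channels in place and preserves alpha (the pslld $0x18 mask
// keeps the top byte). scale is a 16.16-style factor chosen by the caller so
// that pmulhuw yields the interval index:
static inline uint8_t Quantize_C(uint8_t c, int scale, int interval_size,
                                 int interval_offset) {
  return (uint8_t)(((c * scale) >> 16) * interval_size + interval_offset);
}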
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu (%0),%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 @@ -5335,27 +5335,27 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, int width, uint32_t value) { asm volatile( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -5372,28 +5372,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, int width) { asm volatile( - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -5411,26 +5411,26 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, int width) { asm volatile( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // 4 pixel loop. 
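// Illustrative scalar sketch: the multiply loops widen one source by byte
// duplication (c -> c * 0x101) and zero-extend the other, so (v)pmulhuw yields
// approximately (a * b) / 255 per channel:
static inline uint8_t Multiply_C(uint8_t a, uint8_t b) {
  return (uint8_t)((a * 0x101u * b) >> 16);  // high word of the widened product
}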
LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu (%1),%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -5456,15 +5456,15 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -5484,14 +5484,14 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, // 4 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpaddusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpaddusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -5512,15 +5512,15 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -5540,14 +5540,14 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, // 4 pixel loop. 
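// Illustrative scalar sketch: ARGBAddRow and ARGBSubtractRow are plain
// saturating byte operations, exactly what paddusb / psubusb and their VEX
// forms compute:
static inline uint8_t AddU8_C(uint8_t a, uint8_t b) {
  int v = a + b;
  return (uint8_t)(v > 255 ? 255 : v);   // paddusb
}
static inline uint8_t SubU8_C(uint8_t a, uint8_t b) {
  return (uint8_t)(a > b ? a - b : 0);   // psubusb
}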
LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpsubusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpsubusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -5569,40 +5569,40 @@ void SobelXRow_SSE2(const uint8_t* src_y0, uint8_t* dst_sobelx, int width) { asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x2(%0),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "movq 0x02(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x00(%0,%2,1),%%xmm2 \n" - "movq 0x02(%0,%2,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%3,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "movq 0x2(%0),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "movq 0x02(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x00(%0,%2,1),%%xmm2 \n" + "movq 0x02(%0,%2,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%3,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%4 \n" + "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 @@ -5623,39 +5623,39 @@ void SobelYRow_SSE2(const uint8_t* src_y0, uint8_t* dst_sobely, int width) { asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. 
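// Illustrative scalar sketch: both Sobel kernels accumulate the classic 1-2-1
// weighted differences in 16 bits, take the absolute value with the
// pxor/psubw/pmaxsw idiom, and saturate to a byte. For SobelXRow over three
// source rows y0..y2:
static inline uint8_t SobelX_C(const uint8_t* y0, const uint8_t* y1,
                               const uint8_t* y2, int x) {
  int s = (y0[x] - y0[x + 2]) + 2 * (y1[x] - y1[x + 2]) + (y2[x] - y2[x + 2]);
  if (s < 0) s = -s;
  return (uint8_t)(s > 255 ? 255 : s);
}
// SobelYRow is the transpose: differences between two rows at x, x+1 and x+2.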
LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x1(%0),%%xmm1 \n" - "movq 0x01(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x2(%0),%%xmm2 \n" - "movq 0x02(%0,%1,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%2,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 @@ -5676,37 +5676,37 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "movdqu %%xmm3,0x20(%2) \n" - "movdqu %%xmm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -5723,21 +5723,21 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, uint8_t* dst_y, int width) { asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 @@ -5758,36 +5758,36 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6,(%2) \n" - "movdqu %%xmm4,0x10(%2) \n" - "movdqu %%xmm7,0x20(%2) \n" - "movdqu %%xmm1,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -5806,67 +5806,67 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, const int32_t* previous_cumsum, int width) { asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" // 4 pixel loop. 
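// Illustrative scalar sketch (assumption): an integral-image row. Each output
// element is the running per-channel sum along this row plus the previous
// row's cumulative sum at the same position, held as 32-bit ints (hence the
// byte -> word -> dword unpack ladder in the loop below):
static void ComputeCumulativeSum_C(const uint8_t* row, int32_t* cumsum,
                                   const int32_t* previous_cumsum, int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int i = 0; i < width * 4; ++i) {
    sum[i & 3] += row[i];                       // running sum per ARGB channel
    cumsum[i] = sum[i & 3] + previous_cumsum[i];
  }
}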
LABELALIGN "40: \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu 0x10(%2),%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu 0x20(%2),%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu 0x30(%2),%%xmm5 \n" - "lea 0x40(%2),%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "movdqu %%xmm4,0x20(%1) \n" - "movdqu %%xmm5,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "add $0x3,%3 \n" + "jl 19f \n" // 1 pixel loop. LABELALIGN "10: \n" - "movd (%0),%%xmm2 \n" - "lea 0x4(%0),%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "lea 0x10(%2),%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" "19: \n" : "+r"(row), // %0 @@ -5886,119 +5886,119 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, uint8_t* dst, int count) { asm volatile( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" - - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" // 4 pixel small loop. 
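// Illustrative scalar sketch (assumption): a box filter read out of the
// integral image. With tl/bl the top-left and bottom-left cumulative rows and
// w the box width (%4), each channel is (tl[x] - tl[x+w] - bl[x] + bl[x+w])
// divided by the box area. The small-count path below folds 1/area into a
// pmulhuw constant; the general path converts to float and multiplies by the
// rcpss reciprocal prepared above.
static inline uint8_t BoxAverage_C(int32_t tl, int32_t tr, int32_t bl,
                                   int32_t br, int area) {
  int32_t v = (tl - tr - bl + br) / area;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}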
LABELALIGN "4: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" // 4 pixel loop LABELALIGN "40: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq 
%%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "add $0x3,%3 \n" + "jl 19f \n" // 1 pixel loop LABELALIGN "10: \n" - "movdqu (%0),%%xmm0 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" "19: \n" : "+r"(topleft), // %0 "+r"(botleft), // %1 @@ -6021,70 +6021,70 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; asm volatile( - "movq (%3),%%xmm2 \n" - "movq 0x08(%3),%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" // 4 pixel loop LABELALIGN "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1,(%2) \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0,0x08(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + 
"movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" + "add $0x3,%4 \n" + "jl 19f \n" // 1 pixel loop LABELALIGN "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x04(%2),%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" "19: \n" : "+r"(src_argb), // %0 "+r"(src_argb_stride_temp), // %1 @@ -6106,68 +6106,68 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" // General purpose row blend. LABELALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" // Blend 50 / 50. LABELALIGN "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. 
LABELALIGN "100: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" "99: \n" : "+r"(dst_ptr), // %0 @@ -6187,61 +6187,61 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { asm volatile( - "cmp $0x0,%3 \n" - "je 100f \n" - "sub %1,%0 \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" "vbroadcastss %%xmm4,%%ymm4 \n" // General purpose row blend. LABELALIGN "1: \n" - "vmovdqu (%1),%%ymm0 \n" - "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" // Blend 50 / 50. LABELALIGN "50: \n" - "vmovdqu (%1),%%ymm0 \n" - "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. 
LABELALIGN "100: \n" - "rep movsb \n" - "jmp 999f \n" + "rep movsb \n" + "jmp 999f \n" "99: \n" "vzeroupper \n" @@ -6263,20 +6263,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, int width) { asm volatile( - "movdqu (%3),%%xmm5 \n" + "movdqu (%3),%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -6297,16 +6297,16 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -6324,24 +6324,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "add $0x10,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6360,24 +6360,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "add $0x10,%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6396,26 +6396,26 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - 
"vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" "vextractf128 $0x0,%%ymm1,(%3) \n" "vextractf128 $0x0,%%ymm2,0x10(%3) \n" "vextractf128 $0x1,%%ymm1,0x20(%3) \n" "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -6435,26 +6435,26 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" - "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" "vextractf128 $0x0,%%ymm1,(%3) \n" "vextractf128 $0x0,%%ymm2,0x10(%3) \n" "vextractf128 $0x1,%%ymm1,0x20(%3) \n" "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -6473,47 +6473,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, int width) { asm volatile( - "pxor %%xmm3,%%xmm3 \n" + "pxor %%xmm3,%%xmm3 \n" // 2 pixel loop. 
LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps 0x10(%3),%%xmm0 \n" - "mulps 0x10(%3),%%xmm4 \n" - "addps (%3),%%xmm0 \n" - "addps (%3),%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps 0x20(%3),%%xmm2 \n" - "mulps 0x20(%3),%%xmm6 \n" - "mulps 0x30(%3),%%xmm1 \n" - "mulps 0x30(%3),%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -6609,27 +6609,27 @@ void HalfFloatRow_AVX2(const uint16_t* src, int width) { scale *= kScaleBias; asm volatile( - "vbroadcastss %3, %%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" // 16 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm2 \n" // 16 shorts - "add $0x20,%0 \n" - "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates - "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vpsrld $0xd,%%ymm3,%%ymm3 \n" - "vpsrld $0xd,%%ymm2,%%ymm2 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates - "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -6650,8 +6650,8 @@ void HalfFloatRow_F16C(const uint16_t* src, float scale, int width) { asm volatile( - "vbroadcastss %3, %%ymm4 \n" - "sub %0,%1 \n" + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" // 16 pixel loop. 
LABELALIGN @@ -6685,7 +6685,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" // 16 pixel loop. LABELALIGN "1: \n" @@ -6719,21 +6719,21 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb, // 1 pixel loop. LABELALIGN "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "movzb -0x1(%0),%1 \n" - "movzb 0x03(%3,%1,4),%1 \n" - "mov %b1,-0x1(%0) \n" - "dec %2 \n" - "jg 1b \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "movzb -0x1(%0),%1 \n" + "movzb 0x03(%3,%1,4),%1 \n" + "mov %b1,-0x1(%0) \n" + "dec %2 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "=&d"(pixel_temp), // %1 "+r"(width) // %2 @@ -6752,18 +6752,18 @@ void RGBColorTableRow_X86(uint8_t* dst_argb, // 1 pixel loop. LABELALIGN "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "dec %2 \n" - "jg 1b \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "dec %2 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "=&d"(pixel_temp), // %1 "+r"(width) // %2 @@ -6782,86 +6782,86 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, uintptr_t pixel_temp; uintptr_t table_temp; asm volatile( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. 
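HalfFloatRow_AVX2 above converts without a dedicated float-to-half instruction: scale is pre-multiplied by kScaleBias, which appears to be 2^-112 (the gap between the float exponent bias 127 and the half bias 15), so after the vmulps the half bits are simply the float bits shifted right by 13, dropping 23-10 mantissa bits. A scalar sketch under those assumptions (hypothetical helper; valid for non-negative values inside the half-float range):

#include <stdint.h>
#include <string.h>

// Hypothetical scalar model of HalfFloatRow: src[i]*scale as IEEE half bits.
static void HalfFloat_Sketch(const uint16_t* src, uint16_t* dst,
                             float scale, int width) {
  const float kScaleBias = 1.9259299444e-34f;  // 2^-112 exponent rebias
  for (int i = 0; i < width; ++i) {
    float f = (float)src[i] * scale * kScaleBias;
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));  // type-pun without UB
    dst[i] = (uint16_t)(bits >> 13);  // matches vpsrld $0xd plus the pack
  }
}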
LABELALIGN "1: \n" - "movdqu (%2),%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb (%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,(%3) \n" - "movzb 0x1(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x1(%3) \n" - "movzb 0x2(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x2(%3) \n" - "movzb 0x3(%2),%0 \n" - "mov %b0,0x3(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x4(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x4(%3) \n" - "movzb 0x5(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x5(%3) \n" - "movzb 0x6(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x6(%3) \n" - "movzb 0x7(%2),%0 \n" - "mov %b0,0x7(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x8(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x8(%3) \n" - "movzb 0x9(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x9(%3) \n" - "movzb 0xa(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xa(%3) \n" - "movzb 0xb(%2),%0 \n" - "mov %b0,0xb(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - - "movzb 0xc(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xc(%3) \n" - "movzb 0xd(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xd(%3) \n" - "movzb 0xe(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xe(%3) \n" - "movzb 0xf(%2),%0 \n" - "mov %b0,0xf(%3) \n" - "lea 0x10(%2),%2 \n" - "lea 0x10(%3),%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" + "movdqu (%2),%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb (%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x4(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x8(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb 0xc(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" : "=&d"(pixel_temp), // %0 "=&a"(table_temp), // %1 "+r"(src_argb), // %2 @@ -6934,46 +6934,47 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, src_y_ptr = (uint8_t*)src_y; asm volatile( - 
"vmovdqu %5, %%ymm0 \n" // init blend value - "vmovdqu %6, %%ymm1 \n" // init blend value - "vmovdqu %7, %%ymm2 \n" // init blend value - // "sub $0x20, %3 \n" //sub 32 from width for final loop + "vmovdqu %5, %%ymm0 \n" // init blend value + "vmovdqu %6, %%ymm1 \n" // init blend value + "vmovdqu %7, %%ymm2 \n" // init blend value + // "sub $0x20, %3 \n" //sub 32 from + // width for final loop LABELALIGN - "1: \n" // label 1 - "vmovdqu (%0,%4), %%ymm3 \n" // src_y - "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 - "vmovdqu (%1), %%ymm5 \n" // src_uv - "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf - "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for - // shuf - "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for - // shuf - "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf - "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for - // shuf - "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 - "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 - "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 - "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 - "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const - "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results - "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h - "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results - "add $0x20, %4 \n" // add to src buffer - // ptr - "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert - "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert - "vmovdqu %%ymm4, (%2) \n" // store dst_yuv - "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h - "add $0x60,%2 \n" // add to dst buffer - // ptr - // "cmp %3, %4 \n" //(width64 - + "1: \n" // label 1 + "vmovdqu (%0,%4), %%ymm3 \n" // src_y + "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 + "vmovdqu (%1), %%ymm5 \n" // src_uv + "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf + "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for + // shuf + "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for + // shuf + "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf + "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for + // shuf + "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 + "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 + "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 + "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 + "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const + "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results + "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h + "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results + "add $0x20, %4 \n" // add to src buffer + // ptr + "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert + "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert + "vmovdqu %%ymm4, (%2) \n" // store dst_yuv + "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h + "add $0x60,%2 \n" // add to dst buffer + // ptr + // "cmp %3, %4 \n" //(width64 - // 32 bytes) and src_offset - "sub $0x20,%3 \n" // 32 pixels per loop - "jg 1b \n" - "vzeroupper \n" // sse-avx2 - // transistions + "sub $0x20,%3 \n" // 32 pixels per loop + "jg 1b \n" + "vzeroupper \n" // sse-avx2 + // transistions : "+r"(src_y), //%0 "+r"(src_vu), //%1 @@ -7004,20 +7005,20 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( - "movdqu %3,%%xmm5 \n" + "movdqu %3,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - 
"pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 @@ -7034,16 +7035,16 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 @@ -7060,36 +7061,36 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, uint8_t* dst_uv, int width) { asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" // load 16 U values - "movdqu (%1),%%xmm1 \n" // load 16 V values - "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row - "movdqu 0(%1,%5,1),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // half size - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x10(%1),%1 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" // store 8 UV pixels - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" // 16 src pixels per loop - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" // load 16 U values + "movdqu (%1),%%xmm1 \n" // load 16 V values + "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row + "movdqu 0(%1,%5,1),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // half size + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x10(%1),%1 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" // store 8 UV pixels + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" // 16 src pixels per loop + "jg 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -7113,29 +7114,29 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" // load 32 U values - "vmovdqu (%1),%%ymm1 \n" // load 32 V values - "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row - "vmovdqu 0(%1,%5,1),%%ymm3 \n" - "lea 0x20(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpaddw 
%%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels - "lea 0x20(%2),%2 \n" - "sub $0x20,%3 \n" // 32 src pixels per loop - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" // load 32 U values + "vmovdqu (%1),%%ymm1 \n" // load 32 V values + "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row + "vmovdqu 0(%1,%5,1),%%ymm3 \n" + "lea 0x20(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels + "lea 0x20(%2),%2 \n" + "sub $0x20,%3 \n" // 32 src pixels per loop + "jg 1b \n" "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 @@ -7148,17 +7149,17 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { asm volatile( - "pxor %%xmm1,%%xmm1 \n" + "pxor %%xmm1,%%xmm1 \n" LABELALIGN "1: \n" - "movd (%0),%%xmm0 \n" // load float - "maxss %%xmm1, %%xmm0 \n" // clamp to zero - "add 4, %0 \n" - "movd %%xmm0, (%1) \n" // store float - "add 4, %1 \n" - "sub $0x4,%2 \n" // 1 float per loop - "jg 1b \n" + "movd (%0),%%xmm0 \n" // load float + "maxss %%xmm1, %%xmm0 \n" // clamp to zero + "add 4, %0 \n" + "movd %%xmm0, (%1) \n" // store float + "add 4, %1 \n" + "sub $0x4,%2 \n" // 1 float per loop + "jg 1b \n" : "+r"(src_x), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 diff --git a/chromium/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libyuv/source/row_neon.cc index b81c53ff2bd..a5aeaabfbd7 100644 --- a/chromium/third_party/libyuv/source/row_neon.cc +++ b/chromium/third_party/libyuv/source/row_neon.cc @@ -114,11 +114,11 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READYUV444 YUVTORGB - "subs %4, %4, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -140,11 +140,11 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -168,10 +168,10 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %5, %5, #8 \n" - "vld1.8 {d23}, [%3]! \n" - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" + "subs %5, %5, #8 \n" + "vld1.8 {d23}, [%3]! \n" + "vst4.8 {d20, d21, d22, d23}, [%4]! 
\n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -195,10 +195,10 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 - "vst4.8 {d19, d20, d21, d22}, [%3]! \n" - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -221,9 +221,9 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vst3.8 {d20, d21, d22}, [%3]! \n" - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -253,9 +253,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" ARGBTORGB565 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "subs %4, %4, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -287,10 +287,10 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" ARGBTOARGB1555 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -321,14 +321,14 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d4, #0x0f \n" // vbic bits to clear + "vmov.u8 d4, #0x0f \n" // vbic bits to clear "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" ARGBTOARGB4444 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels - "bgt 1b \n" + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -348,11 +348,11 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READYUV400 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -366,14 +366,14 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -387,11 +387,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_argb), // %2 @@ -410,11 +410,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_argb), // %2 @@ -439,9 +439,9 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst3.8 {d20, d21, d22}, [%2]! \n" - "bgt 1b \n" + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb24), // %2 @@ -466,9 +466,9 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst3.8 {d20, d21, d22}, [%2]! \n" - "bgt 1b \n" + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_rgb24), // %2 @@ -489,9 +489,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" ARGBTORGB565 - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "subs %3, %3, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 @@ -509,11 +509,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READYUY2 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -530,11 +530,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" READUYVY YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -553,11 +553,11 @@ void SplitUVRow_NEON(const uint8_t* src_uv, int width) { asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store U - "vst1.8 {q1}, [%2]! \n" // store V - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -574,11 +574,11 @@ void MergeUVRow_NEON(const uint8_t* src_u, int width) { asm volatile( "1: \n" - "vld1.8 {q0}, [%0]! \n" // load U - "vld1.8 {q1}, [%1]! \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop - "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.8 {q0, q1}, [%2]! 
\n" // store 16 pairs of UV + "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -596,13 +596,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, int width) { asm volatile( "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB - "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB - "subs %4, %4, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store R - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%3]! \n" // store B - "bgt 1b \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB + "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store R + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%3]! \n" // store B + "bgt 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -621,13 +621,13 @@ void MergeRGBRow_NEON(const uint8_t* src_r, int width) { asm volatile( "1: \n" - "vld1.8 {q0}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q2}, [%2]! \n" // load B - "subs %4, %4, #16 \n" // 16 processed per loop - "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB - "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q2}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB + "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB + "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -642,10 +642,10 @@ void MergeRGBRow_NEON(const uint8_t* src_r, void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 - "bgt 1b \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 // Output registers @@ -657,11 +657,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { asm volatile( - "vdup.8 q0, %2 \n" // duplicate 16 bytes + "vdup.8 q0, %2 \n" // duplicate 16 bytes "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v8) // %2 @@ -671,11 +671,11 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { // ARGBSetRow writes 'width' pixels using an 32 bit value repeated. void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { asm volatile( - "vdup.u32 q0, %2 \n" // duplicate 4 ints + "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" - "subs %1, %1, #4 \n" // 4 pixels per loop - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v32) // %2 @@ -685,18 +685,18 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( // Start at end of source row. - "add %0, %0, %2 \n" - "sub %0, %0, #32 \n" // 32 bytes per loop - - "1: \n" - "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32 - "subs %2, #32 \n" // 32 pixels per loop. 
- "vrev64.8 q0, q2 \n" - "vrev64.8 q1, q1 \n" - "vswp d0, d1 \n" - "vswp d2, d3 \n" - "vst1.8 {q0, q1}, [%1]! \n" // dst += 32 - "bgt 1b \n" + "add %0, %0, %2 \n" + "sub %0, %0, #32 \n" // 32 bytes per loop + + "1: \n" + "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32 + "subs %2, #32 \n" // 32 pixels per loop. + "vrev64.8 q0, q2 \n" + "vrev64.8 q1, q1 \n" + "vswp d0, d1 \n" + "vswp d2, d3 \n" + "vst1.8 {q0, q1}, [%1]! \n" // dst += 32 + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -707,16 +707,16 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { asm volatile( // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %2, lsl #1 \n" - "sub %0, #16 \n" + "mov r12, #-16 \n" + "add %0, %0, %2, lsl #1 \n" + "sub %0, #16 \n" "1: \n" - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst2.8 {d0, d1}, [%1]! \n" // dst += 16 - "bgt 1b \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst2.8 {d0, d1}, [%1]! \n" // dst += 16 + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 "+r"(width) // %2 @@ -730,17 +730,17 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, int width) { asm volatile( // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" "1: \n" - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst1.8 {d0}, [%1]! \n" // dst += 8 - "vst1.8 {d1}, [%2]! \n" - "bgt 1b \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -751,18 +751,18 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "add %0, %0, %2, lsl #2 \n" - "sub %0, #32 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #32 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 d0, d0 \n" - "vrev64.8 d1, d1 \n" - "vrev64.8 d2, d2 \n" - "vrev64.8 d3, d3 \n" - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32 - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 d0, d0 \n" + "vrev64.8 d1, d1 \n" + "vrev64.8 d2, d2 \n" + "vrev64.8 d3, d3 \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32 + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -776,13 +776,13 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, src_rgb24 += width * 3 - 24; asm volatile( "1: \n" - "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 d0, d0 \n" - "vrev64.8 d1, d1 \n" - "vrev64.8 d2, d2 \n" - "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24 - "bgt 1b \n" + "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 d0, d0 \n" + "vrev64.8 d1, d1 \n" + "vrev64.8 d2, d2 \n" + "vst3.8 {d0, d1, d2}, [%1]! 
\n" // dst += 24 + "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -794,12 +794,12 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d4, #255 \n" // Alpha + "vmov.u8 d4, #255 \n" // Alpha "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -810,13 +810,13 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d4, #255 \n" // Alpha + "vmov.u8 d4, #255 \n" // Alpha "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -827,13 +827,13 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( - "vmov.u8 d0, #255 \n" // Alpha + "vmov.u8 d0, #255 \n" // Alpha "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA. - "bgt 1b \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA. + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgba), // %1 "+r"(width) // %2 @@ -844,12 +844,12 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of // RGB24. - "bgt 1b \n" + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -874,13 +874,13 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // Alpha + "vmov.u8 d3, #255 \n" // Alpha "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -920,13 +920,13 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // Alpha + "vmov.u8 d3, #255 \n" // Alpha "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -949,13 +949,13 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // Alpha + "vmov.u8 d3, #255 \n" // Alpha "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -969,11 +969,11 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of // RGB24. - "bgt 1b \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -985,11 +985,11 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. - "bgt 1b \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 "+r"(width) // %2 @@ -1001,10 +1001,10 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1016,10 +1016,10 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. 
- "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1034,11 +1034,11 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d1}, [%1]! \n" // store 8 U. - "vst1.8 {d3}, [%2]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1054,11 +1054,11 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d0}, [%1]! \n" // store 8 U. - "vst1.8 {d2}, [%2]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1074,16 +1074,16 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // stride + src_yuy2 + "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V - "vst1.8 {d1}, [%2]! \n" // store 8 U. - "vst1.8 {d3}, [%3]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(stride_yuy2), // %1 "+r"(dst_u), // %2 @@ -1101,16 +1101,16 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // stride + src_uyvy + "add %1, %0, %1 \n" // stride + src_uyvy "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V - "vst1.8 {d0}, [%2]! \n" // store 8 U. - "vst1.8 {d2}, [%3]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. 
+ "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(stride_uyvy), // %1 "+r"(dst_u), // %2 @@ -1128,14 +1128,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, const uint8_t* shuffler, int width) { asm volatile( - "vld1.8 {q2}, [%3] \n" // shuffler + "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - "vst1.8 {q1}, [%1]! \n" // store 4. - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1151,12 +1151,12 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys - "vld1.8 {d1}, [%1]! \n" // load 8 Us - "vld1.8 {d3}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. - "bgt 1b \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1173,12 +1173,12 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys - "vld1.8 {d0}, [%1]! \n" // load 8 Us - "vld1.8 {d2}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. - "bgt 1b \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1193,11 +1193,11 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 @@ -1210,16 +1210,16 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, const uint32_t dither4, int width) { asm volatile( - "vdup.32 d2, %2 \n" // dither4 + "vdup.32 d2, %2 \n" // dither4 "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d20, d20, d2 \n" - "vqadd.u8 d21, d21, d2 \n" - "vqadd.u8 d22, d22, d2 \n" // add for dither + "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" // add for dither ARGBTORGB565 - "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. - "bgt 1b \n" + "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. 
+ "bgt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 @@ -1232,11 +1232,11 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. - "bgt 1b \n" + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 @@ -1248,14 +1248,14 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "vmov.u8 d4, #0x0f \n" // bits to clear with + "vmov.u8 d4, #0x0f \n" // bits to clear with // vbic. "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. - "bgt 1b \n" + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(width) // %2 @@ -1265,20 +1265,20 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1291,11 +1291,11 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q3}, [%1]! \n" // store 16 A's. - "bgt 1b \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q3}, [%1]! \n" // store 16 A's. 
+ "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 "+r"(width) // %2 @@ -1306,18 +1306,18 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient + "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1327,18 +1327,18 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient + "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d1, d24 \n" // B - "vmlal.u8 q2, d2, d25 \n" // G - "vmlal.u8 q2, d3, d26 \n" // R + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d1, d24 \n" // B + "vmlal.u8 q2, d2, d25 \n" // G + "vmlal.u8 q2, d3, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1352,32 +1352,32 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 + "vmov.u8 d24, #112 \n" // UB / VR 0.875 // coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned - - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 
+ "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1409,34 +1409,34 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride_argb), // %1 "+r"(dst_u), // %2 @@ -1455,34 +1455,34 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride_argb), // %1 "+r"(dst_u), // %2 @@ -1500,34 +1500,34 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_bgra - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. - "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. 
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. - "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q1, q1, #1 \n" // 2x average - "vrshr.u16 q2, q2, #1 \n" - "vrshr.u16 q3, q3, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q3, q2, q1) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(src_stride_bgra), // %1 "+r"(dst_u), // %2 @@ -1545,34 +1545,34 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_abgr - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. 
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_abgr), // %0 "+r"(src_stride_abgr), // %1 "+r"(dst_u), // %2 @@ -1590,34 +1590,34 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgba - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. - "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. - "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" : "+r"(src_rgba), // %0 "+r"(src_stride_rgba), // %1 "+r"(dst_u), // %2 @@ -1635,34 +1635,34 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(src_stride_rgb24), // %1 "+r"(dst_u), // %2 @@ -1680,34 +1680,34 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - "vld3.8 {d9, d11, d13}, [%1]! 
\n" // load last 8 RAW pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(src_stride_raw), // %1 "+r"(dst_u), // %2 @@ -1726,55 +1726,55 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. 
RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(src_stride_rgb565), // %1 "+r"(dst_u), // %2 @@ -1792,55 +1792,55 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 
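// The RGB565TOARGB macro used above expands 5:6:5 channels to 8 bits by
// replicating the top bits into the vacated low bits, so 0x1f maps to 0xff
// exactly. A sketch of the per-pixel equivalent (the macro name comes from
// this file; the bit expansion below is the standard one it implies):
#include <stdint.h>
static void RGB565ToARGBPixel_sketch(uint16_t p, uint8_t argb[4]) {
  uint8_t b = (uint8_t)(p & 0x1f);
  uint8_t g = (uint8_t)((p >> 5) & 0x3f);
  uint8_t r = (uint8_t)(p >> 11);
  argb[0] = (uint8_t)((b << 3) | (b >> 2));
  argb[1] = (uint8_t)((g << 2) | (g >> 4));
  argb[2] = (uint8_t)((r << 3) | (r >> 2));
  argb[3] = 255;
}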
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(src_stride_argb1555), // %1 "+r"(dst_u), // %2 @@ -1858,46 +1858,46 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. 
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q0, q4, #1 \n" // 2x average - "vrshr.u16 q1, q5, #1 \n" - "vrshr.u16 q2, q6, #1 \n" + "vrshr.u16 q0, q4, #1 \n" // 2x average + "vrshr.u16 q1, q5, #1 \n" + "vrshr.u16 q2, q6, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(src_stride_argb4444), // %1 "+r"(dst_u), // %2 @@ -1910,21 +1910,21 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1936,21 +1936,21 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1962,21 +1962,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1986,20 +1986,20 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2009,20 +2009,20 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2032,20 +2032,20 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2055,20 +2055,20 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. 
- "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2078,20 +2078,20 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2101,18 +2101,18 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { asm volatile( - "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient + "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // B - "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // R + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q4, d0, d4 \n" // B + "vmlal.u8 q4, d1, d5 \n" // G + "vmlal.u8 q4, d2, d6 \n" // R "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_yj), // %1 "+r"(width) // %2 @@ -2122,18 +2122,18 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { asm volatile( - "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient + "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! 
\n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // B - "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // R + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q4, d0, d4 \n" // B + "vmlal.u8 q4, d1, d5 \n" // G + "vmlal.u8 q4, d2, d6 \n" // R "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_yj), // %1 "+r"(width) // %2 @@ -2149,46 +2149,46 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int source_y_fraction) { int y1_fraction = source_y_fraction; asm volatile( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 50f \n" + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 50f \n" - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" "99: \n" : "+r"(dst_ptr), // %0 @@ -2206,51 +2206,51 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, uint8_t* dst_argb, int width) { asm volatile( - "subs %3, #8 \n" - "blt 89f \n" + "subs %3, #8 \n" + "blt 89f \n" // Blend 8 pixels. "8: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. 
- "bge 8b \n" + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" + "adds %3, #8-1 \n" + "blt 99f \n" // Blend 1 pixels. "1: \n" - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" - - "99: \n" + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -2267,16 +2267,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, asm volatile( // Attenuate 8 pixels. "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2292,32 +2292,32 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_offset, int width) { asm volatile( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add // 8 pixel loop. "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. 
+ "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" "vqdmulh.s16 q0, q0, q8 \n" // b * scale "vqdmulh.s16 q1, q1, q8 \n" // g "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 @@ -2334,28 +2334,28 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, int width, uint32_t value) { asm volatile( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. // 8 pixel loop. "1: \n" - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 "vqrdmulh.s16 q11, q11, d0[1] \n" // g "vqrdmulh.s16 q12, q12, d0[2] \n" // r "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2368,20 +2368,20 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient + "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2395,32 +2395,32 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : @@ -2436,51 +2436,51 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, const int8_t* matrix_argb, int width) { asm volatile( - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - - "1: \n" - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q11, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + "1: \n" + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. 
- "bgt 1b \n" + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2497,19 +2497,19 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2526,13 +2526,13 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2549,13 +2549,13 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2574,17 +2574,17 @@ void SobelRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // alpha + "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. 
- "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2601,12 +2601,12 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, asm volatile( // 16 pixel loop. "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 @@ -2625,15 +2625,15 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // alpha + "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2653,23 +2653,23 @@ void SobelXRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "vld1.8 {d0}, [%0],%5 \n" // top - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%1],%5 \n" // center * 2 - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%2],%5 \n" // bottom - "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%3]! \n" // store 8 sobelx - "bgt 1b \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! 
\n" // store 8 sobelx + "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 @@ -2691,23 +2691,23 @@ void SobelYRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "vld1.8 {d0}, [%0],%4 \n" // left - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%0],%4 \n" // center * 2 - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%0],%5 \n" // right - "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%2]! \n" // store 8 sobely - "bgt 1b \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 @@ -2729,18 +2729,18 @@ void HalfFloat1Row_NEON(const uint16_t* src, asm volatile( "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2755,18 +2755,18 @@ void HalfFloatRow_NEON(const uint16_t* src, asm volatile( "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2781,17 +2781,17 @@ void ByteToFloatRow_NEON(const uint8_t* src, asm volatile( "1: \n" - "vld1.8 {d2}, [%0]! 
\n" // load 8 bytes - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u8 q1, d2 \n" // 8 shorts - "vmovl.u16 q2, d2 \n" // 8 ints - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // scale - "vmul.f32 q3, q3, %y3 \n" - "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats - "bgt 1b \n" + "vld1.8 {d2}, [%0]! \n" // load 8 bytes + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u8 q1, d2 \n" // 8 shorts + "vmovl.u16 q2, d2 \n" // 8 ints + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // scale + "vmul.f32 q3, q3, %y3 \n" + "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2808,26 +2808,26 @@ void GaussCol_NEON(const uint16_t* src0, uint32_t* dst, int width) { asm volatile( - "vmov.u16 d6, #4 \n" // constant 4 - "vmov.u16 d7, #6 \n" // constant 6 - - "1: \n" - "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows - "vld1.16 {q2}, [%4]! \n" - "vaddl.u16 q0, d2, d4 \n" // * 1 - "vaddl.u16 q1, d3, d5 \n" // * 1 - "vld1.16 {q2}, [%1]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "vld1.16 {q2}, [%2]! \n" - "vmlal.u16 q0, d4, d7 \n" // * 6 - "vmlal.u16 q1, d5, d7 \n" // * 6 - "vld1.16 {q2}, [%3]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "subs %6, %6, #8 \n" // 8 processed per loop - "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples - "bgt 1b \n" + "vmov.u16 d6, #4 \n" // constant 4 + "vmov.u16 d7, #6 \n" // constant 6 + + "1: \n" + "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows + "vld1.16 {q2}, [%4]! \n" + "vaddl.u16 q0, d2, d4 \n" // * 1 + "vaddl.u16 q1, d3, d5 \n" // * 1 + "vld1.16 {q2}, [%1]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "vld1.16 {q2}, [%2]! \n" + "vmlal.u16 q0, d4, d7 \n" // * 6 + "vmlal.u16 q1, d5, d7 \n" // * 6 + "vld1.16 {q2}, [%3]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "subs %6, %6, #8 \n" // 8 processed per loop + "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples + "bgt 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(src2), // %2 @@ -2845,8 +2845,8 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; asm volatile( - "vmov.u32 q10, #4 \n" // constant 4 - "vmov.u32 q11, #6 \n" // constant 6 + "vmov.u32 q10, #4 \n" // constant 4 + "vmov.u32 q11, #6 \n" // constant 6 "1: \n" "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples @@ -2884,16 +2884,16 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "vld1.8 {q2}, [%0]! \n" // load 16 Y values - "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values - "vmov d1, d0 \n" - "vzip.u8 d0, d1 \n" // VV - "vmov d3, d2 \n" - "vzip.u8 d2, d3 \n" // UU - "subs %3, %3, #16 \n" // 16 pixels per loop - "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels - "vst3.8 {d1, d3, d5}, [%2]! \n" - "bgt 1b \n" + "vld1.8 {q2}, [%0]! \n" // load 16 Y values + "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values + "vmov d1, d0 \n" + "vzip.u8 d0, d1 \n" // VV + "vmov d3, d2 \n" + "vzip.u8 d2, d3 \n" // UU + "subs %3, %3, #16 \n" // 16 pixels per loop + "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels + "vst3.8 {d1, d3, d5}, [%2]! 
\n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 @@ -2907,24 +2907,24 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_uv, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_AYUV + "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average "vqrshrun.s16 d0, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. - "bgt 1b \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. + "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_stride_ayuv), // %1 "+r"(dst_uv), // %2 @@ -2938,24 +2938,24 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, uint8_t* dst_vu, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_AYUV + "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average "vqrshrun.s16 d1, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. - "bgt 1b \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. + "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_stride_ayuv), // %1 "+r"(dst_vu), // %2 @@ -2969,11 +2969,11 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q2}, [%1]! \n" // store 16 Y's. - "bgt 1b \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! 
\n" // load 8 AYUV pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q2}, [%1]! \n" // store 16 Y's. + "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2985,12 +2985,12 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values - "vld2.8 {d1, d3}, [%0]! \n" - "vorr.u8 q2, q0, q0 \n" // move U after V - "subs %2, %2, #16 \n" // 16 pixels per loop - "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels - "bgt 1b \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values + "vld2.8 {d1, d3}, [%0]! \n" + "vorr.u8 q2, q0, q0 \n" // move U after V + "subs %2, %2, #16 \n" // 16 pixels per loop + "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 @@ -3008,19 +3008,19 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v_1 = src_v + src_stride_v; asm volatile( "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 16 U values - "vld1.8 {q1}, [%2]! \n" // load 16 V values - "vld1.8 {q2}, [%1]! \n" - "vld1.8 {q3}, [%3]! \n" - "vpaddl.u8 q0, q0 \n" // half size - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q1, q3 \n" + "vld1.8 {q0}, [%0]! \n" // load 16 U values + "vld1.8 {q1}, [%2]! \n" // load 16 V values + "vld1.8 {q2}, [%1]! \n" + "vld1.8 {q3}, [%3]! \n" + "vpaddl.u8 q0, q0 \n" // half size + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q1, q3 \n" "vqrshrn.u16 d0, q0, #2 \n" "vqrshrn.u16 d1, q1, #2 \n" - "subs %5, %5, #16 \n" // 16 src pixels per loop - "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels - "bgt 1b \n" + "subs %5, %5, #16 \n" // 16 src pixels per loop + "vst2.8 {d0, d1}, [%4]! 
\n" // store 8 UV pixels + "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_u_1), // %1 "+r"(src_v), // %2 diff --git a/chromium/third_party/libyuv/source/row_neon64.cc b/chromium/third_party/libyuv/source/row_neon64.cc index 06e9eea993c..d5258a3aef3 100644 --- a/chromium/third_party/libyuv/source/row_neon64.cc +++ b/chromium/third_party/libyuv/source/row_neon64.cc @@ -114,16 +114,16 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" + "movi v23.8b, #255 \n" /* A */ + "1: \n" READYUV444 - "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -146,17 +146,17 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ + "movi v23.8b, #255 \n" /* A */ - "1: \n" + "1: \n" READYUV422 - "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -180,17 +180,17 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "1: \n" + "1: \n" READYUV422 - "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "ld1 {v23.8b}, [%3], #8 \n" - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "prfm pldl1keep, [%3, 448] \n" - "subs %w5, %w5, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" - "b.gt 1b \n" + "ld1 {v23.8b}, [%3], #8 \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "prfm pldl1keep, [%3, 448] \n" + "subs %w5, %w5, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -214,16 +214,16 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v20.8b, #255 \n" /* A */ - "1: \n" + "movi v20.8b, #255 \n" /* A */ + "1: \n" READYUV422 - "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v23, v22, v21) - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -246,15 +246,15 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "1: \n" + "1: \n" READYUV422 - "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "subs %w4, %w4, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, 
[%2, 128] \n" + "subs %w4, %w4, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -286,16 +286,16 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" + "1: \n" READYUV422 YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #8 \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #8 \n" ARGBTORGB565 - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -326,17 +326,17 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #8 \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #8 \n" ARGBTOARGB1555 - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -369,18 +369,18 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v4.16b, #0x0f \n" // bits to clear with vbic. - "1: \n" + "movi v4.16b, #0x0f \n" // bits to clear with vbic. + "1: \n" READYUV422 YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" ARGBTOARGB4444 - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. - "b.gt 1b \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. 
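The ARGBTORGB565, ARGBTOARGB1555 and ARGBTOARGB4444 macros used by the three kernels above (defined earlier in the file, not shown in these hunks) truncate each 8-bit channel down to its target width. Per pixel, the packing amounts to the following sketch, assuming the usual little-endian bit layouts:

#include <stdint.h>

// Truncating 8:8:8(:8) -> 16-bit pixel packs.
uint16_t PackRGB565(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
uint16_t PackARGB1555(uint8_t b, uint8_t g, uint8_t r, uint8_t a) {
  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3));
}
uint16_t PackARGB4444(uint8_t b, uint8_t g, uint8_t r, uint8_t a) {
  return (uint16_t)(((a >> 4) << 12) | ((r >> 4) << 8) | ((g >> 4) << 4) | (b >> 4));
}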
+ "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -401,14 +401,14 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READYUV400 YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -423,15 +423,15 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( - "movi v23.8b, #255 \n" + "movi v23.8b, #255 \n" "1: \n" - "ld1 {v20.8b}, [%0], #8 \n" - "prfm pldl1keep, [%0, 448] \n" - "orr v21.8b, v20.8b, v20.8b \n" - "orr v22.8b, v20.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" + "ld1 {v20.8b}, [%0], #8 \n" + "prfm pldl1keep, [%0, 448] \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -446,15 +446,15 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READNV12 - "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_argb), // %2 @@ -475,15 +475,15 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READNV21 - "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_argb), // %2 @@ -504,14 +504,14 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "1: \n" + "1: \n" READNV12 - "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb24), // %2 @@ -532,14 +532,14 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile ( YUVTORGB_SETUP - "1: \n" + "1: \n" READNV21 - "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 
"+r"(dst_rgb24), // %2 @@ -560,12 +560,12 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP "1: \n" READNV12 - "prfm pldl1keep, [%0, 448] \n" YUVTORGB( + "prfm pldl1keep, [%0, 448] \n" YUVTORGB( v22, v21, v20) ARGBTORGB565 - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels - "b.gt 1b \n" + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 @@ -584,14 +584,14 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READYUY2 - "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -610,14 +610,14 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" + "movi v23.8b, #255 \n" + "1: \n" READUYVY YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" - "b.gt 1b \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -637,12 +637,12 @@ void SplitUVRow_NEON(const uint8_t* src_uv, int width) { asm volatile( "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "prfm pldl1keep, [%0, 448] \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop - "st1 {v0.16b}, [%1], #16 \n" // store U - "st1 {v1.16b}, [%2], #16 \n" // store V - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store U + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -659,13 +659,13 @@ void MergeUVRow_NEON(const uint8_t* src_u, int width) { asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load U - "ld1 {v1.16b}, [%1], #16 \n" // load V - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop - "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -683,13 +683,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, int width) { asm volatile( "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop - "st1 {v0.16b}, [%1], #16 \n" // store R - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%3], #16 \n" // store B - "b.gt 1b \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" 
// store R + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%3], #16 \n" // store B + "b.gt 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -708,16 +708,16 @@ void MergeRGBRow_NEON(const uint8_t* src_r, int width) { asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v2.16b}, [%2], #16 \n" // load B - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB - "prfm pldl1keep, [%0, 448] \n" - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%2], #16 \n" // load B + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB + "prfm pldl1keep, [%0, 448] \n" + "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -732,11 +732,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "1: \n" - "ldp q0, q1, [%0], #32 \n" - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #32 \n" // 32 processed per loop - "stp q0, q1, [%1], #32 \n" - "b.gt 1b \n" + "ldp q0, q1, [%0], #32 \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #32 \n" // 32 processed per loop + "stp q0, q1, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 // Output registers @@ -748,11 +748,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { asm volatile( - "dup v0.16b, %w2 \n" // duplicate 16 bytes + "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" + "subs %w1, %w1, #16 \n" // 16 bytes per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v8) // %2 @@ -761,11 +761,11 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { asm volatile( - "dup v0.4s, %w2 \n" // duplicate 4 ints + "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" + "subs %w1, %w1, #4 \n" // 4 ints per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v32) // %2 @@ -779,17 +779,17 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( // Start at end of source row. - "ld1 {v3.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q2, [%0, 16] \n" - "ldr q1, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #32 \n" // 32 pixels per loop. - "tbl v0.16b, {v2.16b}, v3.16b \n" - "tbl v1.16b, {v1.16b}, v3.16b \n" - "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels - "b.gt 1b \n" + "ld1 {v3.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q2, [%0, 16] \n" + "ldr q1, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #32 \n" // 32 pixels per loop. 
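MirrorRow_NEON starts at the end of the source row and walks backwards, byte-reversing each 16-byte block through the kShuffleMirror table with tbl. The scalar meaning is simply:

#include <stdint.h>

// Scalar reference: MirrorRow reverses the byte order of the row.
void MirrorRow_C_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}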
+ "tbl v0.16b, {v2.16b}, v3.16b \n" + "tbl v1.16b, {v1.16b}, v3.16b \n" + "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -804,17 +804,17 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { asm volatile( // Start at end of source row. - "ld1 {v4.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw #1 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 - "b.gt 1b \n" + "ld1 {v4.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw #1 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 + "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 "+r"(width) // %2 @@ -828,20 +828,20 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, int width) { asm volatile( // Start at end of source row. - "ld1 {v4.16b}, [%4] \n" // shuffler - "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w3, %w3, #16 \n" // 16 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "uzp1 v0.16b, v2.16b, v3.16b \n" // U - "uzp2 v1.16b, v2.16b, v3.16b \n" // V - "st1 {v0.16b}, [%1], #16 \n" // dst += 16 - "st1 {v1.16b}, [%2], #16 \n" - "b.gt 1b \n" + "ld1 {v4.16b}, [%4] \n" // shuffler + "add %0, %0, %w3, sxtw #1 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w3, %w3, #16 \n" // 16 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "uzp1 v0.16b, v2.16b, v3.16b \n" // U + "uzp2 v1.16b, v2.16b, v3.16b \n" // V + "st1 {v0.16b}, [%1], #16 \n" // dst += 16 + "st1 {v1.16b}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -857,17 +857,17 @@ static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( // Start at end of source row. - "ld1 {v4.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #8 \n" // 8 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 - "b.gt 1b \n" + "ld1 {v4.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #8 \n" // 8 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -879,19 +879,19 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { asm volatile( - "ld1 {v3.16b}, [%4] \n" // shuffler - "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. 
- "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #48 \n" + "ld1 {v3.16b}, [%4] \n" // shuffler + "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #48 \n" "1: \n" - "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3\n" // src -= 48 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "tbl v0.16b, {v0.16b}, v3.16b \n" - "tbl v1.16b, {v1.16b}, v3.16b \n" - "tbl v2.16b, {v2.16b}, v3.16b \n" - "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48 - "b.gt 1b \n" + "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "tbl v0.16b, {v0.16b}, v3.16b \n" + "tbl v1.16b, {v1.16b}, v3.16b \n" + "tbl v2.16b, {v2.16b}, v3.16b \n" + "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48 + "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -904,13 +904,14 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( - "movi v4.8b, #255 \n" // Alpha + "movi v4.8b, #255 \n" // Alpha "1: \n" - "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of + // RGB24. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -921,15 +922,15 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( - "movi v5.8b, #255 \n" // Alpha + "movi v5.8b, #255 \n" // Alpha "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a - "b.gt 1b \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -940,15 +941,15 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( - "movi v0.8b, #255 \n" // Alpha + "movi v0.8b, #255 \n" // Alpha "1: \n" - "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v2.8b, v4.8b, v4.8b \n" // move g - "orr v1.8b, v5.8b, v5.8b \n" // move r - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r - "b.gt 1b \n" + "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "orr v2.8b, v4.8b, v4.8b \n" // move g + "orr v1.8b, v5.8b, v5.8b \n" // move r + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgba), // %1 "+r"(width) // %2 @@ -960,13 +961,13 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -992,14 +993,14 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // Alpha + "movi v3.8b, #255 \n" // Alpha "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. RGB565TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1049,14 +1050,14 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // Alpha + "movi v3.8b, #255 \n" // Alpha "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1084,12 +1085,12 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, int width) { asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1103,11 +1104,12 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24 - "b.gt 1b \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of + // RGB24 + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -1119,13 +1121,13 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g - "orr v5.8b, v1.8b, v1.8b \n" // mov b - "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b - "b.gt 1b \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 "+r"(width) // %2 @@ -1137,11 +1139,11 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1153,11 +1155,11 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1172,12 +1174,12 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, int width) { asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - "prfm pldl1keep, [%0, 448] \n" - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "st1 {v1.8b}, [%1], #8 \n" // store 8 U. - "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 + "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1193,12 +1195,12 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, int width) { asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - "prfm pldl1keep, [%0, 448] \n" - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 
- "st1 {v0.8b}, [%1], #8 \n" // store 8 U. - "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY + "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1216,15 +1218,15 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U - "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V - "st1 {v1.8b}, [%2], #8 \n" // store 8 U. - "st1 {v3.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(src_yuy2b), // %1 "+r"(dst_u), // %2 @@ -1244,15 +1246,15 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U - "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V - "st1 {v0.8b}, [%2], #8 \n" // store 8 U. - "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(src_uyvyb), // %1 "+r"(dst_u), // %2 @@ -1270,14 +1272,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, const uint8_t* shuffler, int width) { asm volatile( - "ld1 {v2.16b}, [%3] \n" // shuffler + "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #4 \n" // 4 processed per loop - "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels - "st1 {v1.16b}, [%1], #16 \n" // store 4. - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + "st1 {v1.16b}, [%1], #16 \n" // store 4. 
+ "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1293,14 +1295,14 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "prfm pldl1keep, [%0, 448] \n" - "orr v2.8b, v1.8b, v1.8b \n" - "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "prfm pldl1keep, [%0, 448] \n" + "orr v2.8b, v1.8b, v1.8b \n" + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1317,14 +1319,14 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "prfm pldl1keep, [%0, 448] \n" - "orr v3.8b, v2.8b, v2.8b \n" - "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "prfm pldl1keep, [%0, 448] \n" + "orr v3.8b, v2.8b, v2.8b \n" + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1339,12 +1341,13 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + // pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTORGB565 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 @@ -1357,16 +1360,17 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, const uint32_t dither4, int width) { asm volatile( - "dup v1.4s, %w2 \n" // dither4 - "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels - "prfm pldl1keep, [%0, 448] \n" - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v20.8b, v20.8b, v1.8b \n" - "uqadd v21.8b, v21.8b, v1.8b \n" - "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 - "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 + // pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. 
+ "b.gt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 @@ -1379,12 +1383,13 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + // pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - "b.gt 1b \n" + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 @@ -1396,15 +1401,16 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "movi v4.16b, #0x0f \n" // bits to clear with + "movi v4.16b, #0x0f \n" // bits to clear with // vbic. "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + // pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - "b.gt 1b \n" + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(width) // %2 @@ -1414,21 +1420,21 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1441,11 +1447,11 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. 
+ "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 "+r"(width) // %2 @@ -1456,19 +1462,19 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #29 \n" // B * 0.1140 coefficient + "movi v5.8b, #150 \n" // G * 0.5870 coefficient + "movi v6.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1478,19 +1484,19 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v1.8b, v4.8b \n" // B - "umlal v0.8h, v2.8b, v5.8b \n" // G - "umlal v0.8h, v3.8b, v6.8b \n" // R - "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #29 \n" // B * 0.1140 coefficient + "movi v5.8b, #150 \n" // G * 0.5870 coefficient + "movi v6.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v1.8b, v4.8b \n" // B + "umlal v0.8h, v2.8b, v5.8b \n" // G + "umlal v0.8h, v3.8b, v6.8b \n" // R + "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y + "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1504,33 +1510,33 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile( - "movi v24.8b, #112 \n" // UB / VR 0.875 + "movi v24.8b, #112 \n" // UB / VR 0.875 // coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "umull v4.8h, v0.8b, v24.8b \n" // B - "umlsl v4.8h, v1.8b, v25.8b \n" // G - "umlsl v4.8h, v2.8b, v26.8b \n" // R - "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned - - "umull v3.8h, v2.8b, v24.8b \n" // R - "umlsl v3.8h, v1.8b, v28.8b \n" // G - "umlsl v3.8h, v0.8b, v27.8b \n" // B - "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - - "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1574,28 +1580,28 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 @@ -1614,33 +1620,33 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, int width) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 @@ -1660,27 +1666,27 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v3.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. 
+ "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v3.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_bgra), // %0 "+r"(src_bgra_1), // %1 "+r"(dst_u), // %2 @@ -1700,27 +1706,27 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v3.8h, #1 \n" // 2x average - "urshr v2.8h, v2.8h, #1 \n" - "urshr v1.8h, v1.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v3.8h, #1 \n" // 2x average + "urshr v2.8h, v2.8h, #1 \n" + "urshr v1.8h, v1.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v2.8h, v1.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_abgr), // %0 "+r"(src_abgr_1), // %1 "+r"(dst_u), // %2 @@ -1740,27 +1746,27 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. 
- "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_rgba), // %0 "+r"(src_rgba_1), // %1 "+r"(dst_u), // %2 @@ -1780,27 +1786,27 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(src_rgb24_1), // %1 "+r"(dst_u), // %2 @@ -1820,27 +1826,27 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 
- "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels - "prfm pldl1keep, [%1, 448] \n" - "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v2.8h, v2.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v0.8h, v0.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels + "prfm pldl1keep, [%1, 448] \n" + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v2.8h, v2.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(src_raw_1), // %1 "+r"(dst_u), // %2 @@ -1862,43 +1868,43 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, asm volatile( RGBTOUV_SETUP_REG "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. RGB565TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%1, 448] \n" + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + "prfm pldl1keep, [%1, 448] \n" RGB565TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. RGB565TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 
- "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(src_rgb565_1), // %1 "+r"(dst_u), // %2 @@ -1920,43 +1926,43 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, asm volatile( RGBTOUV_SETUP_REG "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "prfm pldl1keep, [%0, 448] \n" RGB555TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%1, 448] \n" + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + "prfm pldl1keep, [%1, 448] \n" RGB555TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
- "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(src_argb1555_1), // %1 "+r"(dst_u), // %2 @@ -1978,43 +1984,43 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, asm volatile( RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%1, 448] \n" + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + "prfm pldl1keep, [%1, 448] \n" ARGB4444TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(src_argb4444_1), // %1 "+r"(dst_u), // %2 @@ -2030,22 +2036,22 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { asm volatile( - "movi v24.8b, #25 \n" // B * 0.1016 coefficient - "movi v25.8b, #129 \n" // G * 0.5078 coefficient - "movi v26.8b, #66 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant + "movi v24.8b, #25 \n" // B * 0.1016 coefficient + "movi v25.8b, #129 \n" // G * 0.5078 coefficient + "movi v26.8b, #66 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. RGB565TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2058,22 +2064,22 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant + "movi v4.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2085,22 +2091,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { asm volatile( - "movi v24.8b, #25 \n" // B * 0.1016 coefficient - "movi v25.8b, #129 \n" // G * 0.5078 coefficient - "movi v26.8b, #66 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant + "movi v24.8b, #25 \n" // B * 0.1016 coefficient + "movi v25.8b, #129 \n" // G * 0.5078 coefficient + "movi v26.8b, #66 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 
- "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2110,21 +2116,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // R - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // B - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #66 \n" // R * 0.2578 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #25 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2134,21 +2140,21 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // R - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // B - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v6.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v4.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2158,21 +2164,21 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // B - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2182,21 +2188,21 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2206,21 +2212,21 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { asm volatile( - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v6.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v4.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2230,19 +2236,19 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B - "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v4.8b, #29 \n" // B * 0.1140 coefficient + "movi v5.8b, #150 \n" // G * 0.5870 coefficient + "movi v6.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // B + "umlal v0.8h, v1.8b, v5.8b \n" // G + "umlal v0.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_yj), // %1 "+r"(width) // %2 @@ -2252,19 +2258,19 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { asm volatile( - "movi v6.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v4.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "umull v0.8h, v0.8b, v4.8b \n" // B - "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "movi v6.8b, #29 \n" // B * 0.1140 coefficient + "movi v5.8b, #150 \n" // G * 0.5870 coefficient + "movi v4.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // B + "umlal v0.8h, v1.8b, v5.8b \n" // G + "umlal v0.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_yj), // %1 "+r"(width) // %2 @@ -2282,49 +2288,49 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" - "dup v5.16b, %w4 \n" - "dup v4.16b, %w5 \n" + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" // General purpose row blend. "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "subs %w3, %w3, #16 \n" - "umull v2.8h, v0.8b, v4.8b \n" - "umull2 v3.8h, v0.16b, v4.16b \n" - "umlal v2.8h, v1.8b, v5.8b \n" - "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + "ld1 {v0.16b}, [%1], #16 \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" "99: \n" : "+r"(dst_ptr), // %0 @@ -2343,58 +2349,60 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, uint8_t* dst_argb, int width) { asm volatile( - "subs %w3, %w3, #8 \n" - "b.lt 89f \n" + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" // Blend 8 pixels. "8: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - // pixels - "b.ge 8b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + // pixels + "b.ge 8b \n" "89: \n" - "adds %w3, %w3, #8-1 \n" - "b.lt 99f \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" // Blend 1 pixels. "1: \n" - "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. - "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #1 \n" // 1 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. - "b.ge 1b \n" + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel + // ARGB0. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel + // ARGB1. + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #1 \n" // 1 processed per loop. 
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" "99: \n" @@ -2414,17 +2422,17 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, asm volatile( // Attenuate 8 pixels. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v3.8b \n" // b * a - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2440,33 +2448,33 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_offset, int width) { asm volatile( - "dup v4.8h, %w2 \n" - "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 - "dup v5.8h, %w3 \n" // interval multiply. - "dup v6.8h, %w4 \n" // interval add + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. - "prfm pldl1keep, [%0, 448] \n" - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 255) - "uxtl v1.8h, v1.8b \n" - "uxtl v2.8h, v2.8b \n" - "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale - "sqdmulh v1.8h, v1.8h, v4.8h \n" // g - "sqdmulh v2.8h, v2.8h, v4.8h \n" // r - "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size - "mul v1.8h, v1.8h, v5.8h \n" // g - "mul v2.8h, v2.8h, v5.8h \n" // r - "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset - "add v1.8h, v1.8h, v6.8h \n" // g - "add v2.8h, v2.8h, v6.8h \n" // r - "uqxtn v0.8b, v0.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v2.8b, v2.8h \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "prfm pldl1keep, [%0, 448] \n" + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 @@ -2483,29 +2491,29 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, int width, uint32_t value) { asm volatile( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. "1: \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" // b (0 .. 255) - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g - "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r - "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" - "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2518,21 +2526,21 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "movi v24.8b, #29 \n" // B * 0.1140 coefficient - "movi v25.8b, #150 \n" // G * 0.5870 coefficient - "movi v26.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B - "orr v1.8b, v0.8b, v0.8b \n" // G - "orr v2.8b, v0.8b, v0.8b \n" // R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" + "movi v24.8b, #29 \n" // B * 0.1140 coefficient + "movi v25.8b, #150 \n" // G * 0.5870 coefficient + "movi v26.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
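The quantize kernel is a fixed-point scale, floor to an interval, and offset; alpha (v3) passes through untouched. A scalar sketch — the NEON code pre-halves scale (ushr #1) because sqdmulh doubles the product before taking the high half:

    #include <stdint.h>

    // (v * scale) >> 16 picks the interval index; the multiply/add maps it
    // back into pixel range. uqxtn saturates the result to 8 bits.
    static uint8_t Quantize(uint8_t v, int scale, int interval_size,
                            int interval_offset) {
      int q = ((v * scale) >> 16) * interval_size + interval_offset;
      return (uint8_t)(q > 255 ? 255 : q);
    }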
+ "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2547,33 +2555,33 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { asm volatile( - "movi v20.8b, #17 \n" // BB coefficient - "movi v21.8b, #68 \n" // BG coefficient - "movi v22.8b, #35 \n" // BR coefficient - "movi v24.8b, #22 \n" // GB coefficient - "movi v25.8b, #88 \n" // GG coefficient - "movi v26.8b, #45 \n" // GR coefficient - "movi v28.8b, #24 \n" // BB coefficient - "movi v29.8b, #98 \n" // BG coefficient - "movi v30.8b, #50 \n" // BR coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. - "b.gt 1b \n" + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : @@ -2589,52 +2597,52 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, const int8_t* matrix_argb, int width) { asm volatile( - "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. - "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. - "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit - "uxtl v17.8h, v17.8b \n" // g - "uxtl v18.8h, v18.8b \n" // r - "uxtl v19.8h, v19.8b \n" // a - "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B - "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G - "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R - "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A - "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B - "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G - "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R - "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B - "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G - "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R - "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B - "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G - "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R - "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B - "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G - "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R - "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. + + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2652,21 +2660,21 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // multiply B - "umull v1.8h, v1.8b, v5.8b \n" // multiply G - "umull v2.8h, v2.8b, v6.8b \n" // multiply R - "umull v3.8h, v3.8b, v7.8b \n" // multiply A - "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B - "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G - "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R - "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2683,17 +2691,17 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v4.8b \n" - "uqadd v1.8b, v1.8b, v5.8b \n" - "uqadd v2.8b, v2.8b, v6.8b \n" - "uqadd v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2710,17 +2718,17 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, asm volatile( // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqsub v0.8b, v0.8b, v4.8b \n" - "uqsub v1.8b, v1.8b, v5.8b \n" - "uqsub v2.8b, v2.8b, v6.8b \n" - "uqsub v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2739,19 +2747,19 @@ void SobelRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. "1: \n" - "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v1.8b \n" // add - "orr v1.8b, v0.8b, v0.8b \n" - "orr v2.8b, v0.8b, v0.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2768,14 +2776,14 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, asm volatile( // 16 pixel loop. "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. - "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "uqadd v0.16b, v0.16b, v1.16b \n" // add - "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 @@ -2794,17 +2802,17 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. "1: \n" - "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v1.8b, v0.8b, v2.8b \n" // add - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2824,26 +2832,26 @@ void SobelXRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "ld1 {v0.8b}, [%0],%5 \n" // top - "ld1 {v1.8b}, [%0],%6 \n" - "prfm pldl1keep, [%0, 448] \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "ld1 {v2.8b}, [%1],%5 \n" // center * 2 - "ld1 {v3.8b}, [%1],%6 \n" - "prfm pldl1keep, [%1, 448] \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%2],%5 \n" // bottom - "ld1 {v3.8b}, [%2],%6 \n" - "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "b.gt 1b \n" + "ld1 {v0.8b}, [%0],%5 \n" // top + "ld1 {v1.8b}, [%0],%6 \n" + "prfm pldl1keep, [%0, 448] \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + "ld1 {v3.8b}, [%1],%6 \n" + "prfm pldl1keep, [%1, 448] \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%2],%5 \n" // bottom + "ld1 {v3.8b}, [%2],%6 \n" + "prfm pldl1keep, [%2, 448] \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 @@ -2865,25 +2873,25 @@ void SobelYRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "ld1 {v0.8b}, [%0],%4 \n" // left - "ld1 {v1.8b}, [%1],%4 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "ld1 {v2.8b}, [%0],%4 \n" // center * 2 - "ld1 {v3.8b}, [%1],%4 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%0],%5 \n" // right - "ld1 {v3.8b}, [%1],%5 \n" - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "b.gt 1b \n" + "ld1 {v0.8b}, [%0],%4 \n" // left + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%0],%5 \n" // right + "ld1 {v3.8b}, [%1],%5 \n" + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 @@ -2901,17 +2909,17 @@ void HalfFloat1Row_NEON(const uint16_t* src, int width) { asm volatile( "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fcvtn v1.4h, v2.4s \n" // 8 half floats - "fcvtn2 v1.8h, v3.4s \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + 
"prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fcvtn v1.4h, v2.4s \n" // 8 half floats + "fcvtn2 v1.8h, v3.4s \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2925,19 +2933,19 @@ void HalfFloatRow_NEON(const uint16_t* src, int width) { asm volatile( "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent - "fmul v3.4s, v3.4s, %3.s[0] \n" - "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat - "uqshrn2 v1.8h, v3.4s, #13 \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v3.4s, v3.4s, %3.s[0] \n" + "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v1.8h, v3.4s, #13 \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2951,18 +2959,18 @@ void ByteToFloatRow_NEON(const uint8_t* src, int width) { asm volatile( "1: \n" - "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v1.8h, v1.8b \n" // 8 shorts - "uxtl v2.4s, v1.4h \n" // 8 ints - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "fmul v3.4s, v3.4s, %3.s[0] \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats - "b.gt 1b \n" + "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v1.8h, v1.8b \n" // 8 shorts + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "fmul v3.4s, v3.4s, %3.s[0] \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2976,21 +2984,21 @@ float ScaleMaxSamples_NEON(const float* src, int width) { float fmax; asm volatile( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "fmul v4.4s, v2.4s, %4.s[0] \n" // scale - "fmax v5.4s, v5.4s, v1.4s \n" // max - "fmax v6.4s, v6.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "fmax v5.4s, v5.4s, v6.4s \n" // max - "fmaxv %s3, v5.4s \n" // signed max acculator + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" // scale + "fmax v5.4s, v5.4s, v1.4s \n" // max + "fmax v6.4s, v6.4s, v2.4s \n" + "st1 {v3.4s, 
v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "fmax v5.4s, v5.4s, v6.4s \n" // max + "fmaxv %s3, v5.4s \n" // signed max acculator : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width), // %2 @@ -3006,22 +3014,22 @@ float ScaleSumSamples_NEON(const float* src, int width) { float fsum; asm volatile( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" // max - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "fmul v4.4s, v2.4s, %4.s[0] \n" - "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares - "fmla v6.4s, v2.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "faddp v5.4s, v5.4s, v6.4s \n" - "faddp v5.4s, v5.4s, v5.4s \n" - "faddp %3.4s, v5.4s, v5.4s \n" // sum + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" // max + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" + "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares + "fmla v6.4s, v2.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "faddp v5.4s, v5.4s, v6.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "faddp %3.4s, v5.4s, v5.4s \n" // sum : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width), // %2 @@ -3034,13 +3042,13 @@ float ScaleSumSamples_NEON(const float* src, void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { asm volatile( "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v1.4s, v1.4s, %3.s[0] \n" // scale - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %3.s[0] \n" // scale + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -3057,31 +3065,31 @@ void GaussCol_NEON(const uint16_t* src0, uint32_t* dst, int width) { asm volatile( - "movi v6.8h, #4 \n" // constant 4 - "movi v7.8h, #6 \n" // constant 6 - - "1: \n" - "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows - "ld1 {v2.8h}, [%4], #16 \n" - "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 - "prfm pldl1keep, [%0, 448] \n" - "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 - "ld1 {v2.8h}, [%1], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "prfm pldl1keep, [%1, 448] \n" - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "ld1 {v2.8h}, [%2], #16 \n" - "umlal v0.4s, v2.4h, v7.4h \n" // * 6 - "prfm pldl1keep, [%2, 448] \n" - "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 - "ld1 {v2.8h}, [%3], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "prfm pldl1keep, [%3, 448] \n" - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "subs %w6, %w6, #8 \n" // 8 processed per loop - "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples - "prfm pldl1keep, [%4, 448] \n" - "b.gt 1b \n" + "movi v6.8h, #4 \n" // constant 4 + "movi v7.8h, #6 \n" // constant 6 + + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows + "ld1 {v2.8h}, [%4], #16 \n" + "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 + "prfm pldl1keep, [%0, 448] \n" + "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 + "ld1 {v2.8h}, [%1], #16 \n" + 
"umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "prfm pldl1keep, [%1, 448] \n" + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "ld1 {v2.8h}, [%2], #16 \n" + "umlal v0.4s, v2.4h, v7.4h \n" // * 6 + "prfm pldl1keep, [%2, 448] \n" + "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 + "ld1 {v2.8h}, [%3], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "prfm pldl1keep, [%3, 448] \n" + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples + "prfm pldl1keep, [%4, 448] \n" + "b.gt 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(src2), // %2 @@ -3099,28 +3107,28 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; asm volatile( - "movi v6.4s, #4 \n" // constant 4 - "movi v7.4s, #6 \n" // constant 6 - - "1: \n" - "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples - "add v0.4s, v0.4s, v1.4s \n" // * 1 - "add v1.4s, v1.4s, v2.4s \n" // * 1 - "ld1 {v2.4s,v3.4s}, [%2], #32 \n" - "mla v0.4s, v2.4s, v7.4s \n" // * 6 - "mla v1.4s, v3.4s, v7.4s \n" // * 6 - "ld1 {v2.4s,v3.4s}, [%1], #32 \n" - "ld1 {v4.4s,v5.4s}, [%3], #32 \n" - "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 - "add v3.4s, v3.4s, v5.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "mla v0.4s, v2.4s, v6.4s \n" // * 4 - "mla v1.4s, v3.4s, v6.4s \n" // * 4 - "subs %w5, %w5, #8 \n" // 8 processed per loop - "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack - "uqrshrn2 v0.8h, v1.4s, #8 \n" - "st1 {v0.8h}, [%4], #16 \n" // store 8 samples - "b.gt 1b \n" + "movi v6.4s, #4 \n" // constant 4 + "movi v7.4s, #6 \n" // constant 6 + + "1: \n" + "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "add v0.4s, v0.4s, v1.4s \n" // * 1 + "add v1.4s, v1.4s, v2.4s \n" // * 1 + "ld1 {v2.4s,v3.4s}, [%2], #32 \n" + "mla v0.4s, v2.4s, v7.4s \n" // * 6 + "mla v1.4s, v3.4s, v7.4s \n" // * 6 + "ld1 {v2.4s,v3.4s}, [%1], #32 \n" + "ld1 {v4.4s,v5.4s}, [%3], #32 \n" + "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 + "add v3.4s, v3.4s, v5.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "mla v0.4s, v2.4s, v6.4s \n" // * 4 + "mla v1.4s, v3.4s, v6.4s \n" // * 4 + "subs %w5, %w5, #8 \n" // 8 processed per loop + "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack + "uqrshrn2 v0.8h, v1.4s, #8 \n" + "st1 {v0.8h}, [%4], #16 \n" // store 8 samples + "b.gt 1b \n" : "+r"(src), // %0 "+r"(src1), // %1 "+r"(src2), // %2 @@ -3142,30 +3150,30 @@ void GaussCol_F32_NEON(const float* src0, float* dst, int width) { asm volatile( - "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 - - "1: \n" - "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows - "ld1 {v2.4s, v3.4s}, [%1], #32 \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "ld1 {v4.4s, v5.4s}, [%2], #32 \n" - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "fmla v0.4s, v4.4s, v7.4s \n" // * 6 - "ld1 {v2.4s, v3.4s}, [%3], #32 \n" - "fmla v1.4s, v5.4s, v7.4s \n" - "prfm pldl1keep, [%1, 448] \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "ld1 {v4.4s, v5.4s}, [%4], #32 \n" - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%2, 448] \n" - "fadd v0.4s, v0.4s, v4.4s \n" // * 1 - "prfm pldl1keep, [%3, 448] \n" - "fadd v1.4s, v1.4s, v5.4s \n" - "prfm pldl1keep, [%4, 448] \n" - "subs %w6, %w6, #8 \n" // 8 processed per loop - "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples - "b.gt 1b \n" + "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 + + "1: \n" + "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows + "ld1 {v2.4s, v3.4s}, [%1], #32 \n" + "fmla 
v0.4s, v2.4s, v6.4s \n" // * 4 + "ld1 {v4.4s, v5.4s}, [%2], #32 \n" + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "fmla v0.4s, v4.4s, v7.4s \n" // * 6 + "ld1 {v2.4s, v3.4s}, [%3], #32 \n" + "fmla v1.4s, v5.4s, v7.4s \n" + "prfm pldl1keep, [%1, 448] \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "ld1 {v4.4s, v5.4s}, [%4], #32 \n" + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%2, 448] \n" + "fadd v0.4s, v0.4s, v4.4s \n" // * 1 + "prfm pldl1keep, [%3, 448] \n" + "fadd v1.4s, v1.4s, v5.4s \n" + "prfm pldl1keep, [%4, 448] \n" + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples + "b.gt 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(src2), // %2 @@ -3180,27 +3188,28 @@ void GaussCol_F32_NEON(const float* src0, // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. void GaussRow_F32_NEON(const float* src, float* dst, int width) { asm volatile( - "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 - - "1: \n" - "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4\n" // load 12 samples, 5 rows - "fadd v0.4s, v0.4s, v1.4s \n" // * 1 - "ld1 {v4.4s, v5.4s}, [%0], %5 \n" - "fadd v1.4s, v1.4s, v2.4s \n" - "fmla v0.4s, v4.4s, v7.4s \n" // * 6 - "ld1 {v2.4s, v3.4s}, [%0], %4 \n" - "fmla v1.4s, v5.4s, v7.4s \n" - "ld1 {v4.4s, v5.4s}, [%0], %6 \n" - "fadd v2.4s, v2.4s, v4.4s \n" - "fadd v3.4s, v3.4s, v5.4s \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "fmul v0.4s, v0.4s, v8.4s \n" // / 256 - "fmul v1.4s, v1.4s, v8.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" + "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 + + "1: \n" + "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5 + // rows + "fadd v0.4s, v0.4s, v1.4s \n" // * 1 + "ld1 {v4.4s, v5.4s}, [%0], %5 \n" + "fadd v1.4s, v1.4s, v2.4s \n" + "fmla v0.4s, v4.4s, v7.4s \n" // * 6 + "ld1 {v2.4s, v3.4s}, [%0], %4 \n" + "fmla v1.4s, v5.4s, v7.4s \n" + "ld1 {v4.4s, v5.4s}, [%0], %6 \n" + "fadd v2.4s, v2.4s, v4.4s \n" + "fadd v3.4s, v3.4s, v5.4s \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "fmul v0.4s, v0.4s, v8.4s \n" // / 256 + "fmul v1.4s, v1.4s, v8.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -3218,15 +3227,15 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values - "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values - "subs %w3, %w3, #16 \n" // 16 pixels per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels - "b.gt 1b \n" + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 @@ -3243,19 +3252,19 @@ void 
AYUVToUVRow_NEON(const uint8_t* src_ayuv, asm volatile( "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v2.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v2.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. + "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_ayuv_1), // %1 "+r"(dst_uv), // %2 @@ -3272,19 +3281,19 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, asm volatile( "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. 
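The uaddlp/uadalp/uqrshrn #2 sequence above is the 2x2 box average named in the comments: pairwise sums within one row, accumulated with the row below, then a rounding shift by two. A minimal C sketch (Average4 is a hypothetical name, not a libyuv API):

#include <stdint.h>

// Rounded average of a 2x2 block of chroma samples.
static uint8_t Average4(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}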
+ "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_ayuv_1), // %1 "+r"(dst_vu), // %2 @@ -3297,11 +3306,11 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels + "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3316,16 +3325,16 @@ static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( - "ld1 {v2.16b}, [%3] \n" // shuffler + "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" - "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values - "ld1 {v1.16b}, [%0], 16 \n" - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "tbl v0.16b, {v0.16b}, v2.16b \n" - "tbl v1.16b, {v1.16b}, v2.16b \n" - "stp q0, q1, [%1], 32 \n" // store 16 VU pixels - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values + "ld1 {v1.16b}, [%0], 16 \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "tbl v0.16b, {v0.16b}, v2.16b \n" + "tbl v1.16b, {v1.16b}, v2.16b \n" + "stp q0, q1, [%1], 32 \n" // store 16 VU pixels + "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 @@ -3343,23 +3352,23 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v_1 = src_v + src_stride_v; asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values - "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values - "ld1 {v2.16b}, [%1], #16 \n" - "ld1 {v3.16b}, [%3], #16 \n" - "uaddlp v0.8h, v0.16b \n" // half size - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "uadalp v0.8h, v2.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v3.16b \n" - "prfm pldl1keep, [%3, 448] \n" - "uqrshrn v0.8b, v0.8h, #2 \n" - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w5, %w5, #16 \n" // 16 src pixels per loop - "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values + "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values + "ld1 {v2.16b}, [%1], #16 \n" + "ld1 {v3.16b}, [%3], #16 \n" + "uaddlp v0.8h, v0.16b \n" // half size + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "uadalp v0.8h, v2.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v3.16b \n" + "prfm pldl1keep, [%3, 448] \n" + "uqrshrn v0.8b, v0.8h, #2 \n" + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w5, %w5, #16 \n" // 16 src pixels per loop + "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels + "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_u_1), // %1 "+r"(src_v), // %2 diff --git a/chromium/third_party/libyuv/source/scale_any.cc b/chromium/third_party/libyuv/source/scale_any.cc index b571aec964f..c93d70c5fc7 100644 --- a/chromium/third_party/libyuv/source/scale_any.cc +++ b/chromium/third_party/libyuv/source/scale_any.cc @@ -490,6 +490,13 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI, 4, 1) #endif +#ifdef HAS_SCALEUVROWDOWNEVEN_NEON +SDAANY(ScaleUVRowDownEven_Any_NEON, + ScaleUVRowDownEven_NEON, + 
ScaleUVRowDownEven_C, + 2, + 3) +#endif #ifdef SASIMDONLY // This also works and uses memcpy and SIMD instead of C, but is slower on ARM diff --git a/chromium/third_party/libyuv/source/scale_common.cc b/chromium/third_party/libyuv/source/scale_common.cc index fd4cbd03867..81959925c8a 100644 --- a/chromium/third_party/libyuv/source/scale_common.cc +++ b/chromium/third_party/libyuv/source/scale_common.cc @@ -1412,8 +1412,8 @@ enum FilterMode ScaleFilterReduce(int src_width, src_height = -src_height; } if (filtering == kFilterBox) { - // If scaling both axis to 0.5 or larger, switch from Box to Bilinear. - if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { + // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { filtering = kFilterBilinear; } } diff --git a/chromium/third_party/libyuv/source/scale_gcc.cc b/chromium/third_party/libyuv/source/scale_gcc.cc index ef59c7e95ef..e575ee18bcb 100644 --- a/chromium/third_party/libyuv/source/scale_gcc.cc +++ b/chromium/third_party/libyuv/source/scale_gcc.cc @@ -102,16 +102,16 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, // 16 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -125,25 +125,25 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -156,33 +156,33 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - 
"pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -200,17 +200,17 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -225,26 +225,26 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -258,34 +258,34 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - 
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -301,24 +301,24 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -332,46 +332,46 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, int dst_width) { intptr_t stridex3; asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea 0x00(%4,%4,2),%3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%4,2),%%xmm2 \n" - "movdqu 0x10(%0,%4,2),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 
\n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -387,26 +387,26 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -420,46 +420,46 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 
\n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -476,9 +476,9 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" : : "m"(kShuf0), // %0 "m"(kShuf1), // %1 @@ -488,20 +488,20 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -514,18 +514,18 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 : : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 : : "m"(kMadd01), // %0 "m"(kMadd11), // %1 @@ -535,37 +535,37 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, LABELALIGN "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 
\n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -580,18 +580,18 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 : : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 : : "m"(kMadd01), // %0 "m"(kMadd11), // %1 @@ -602,40 +602,40 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, LABELALIGN "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" 
+ "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -651,23 +651,23 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -681,10 +681,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" : : "m"(kShufAb0), // %0 "m"(kShufAb1), // %1 @@ -695,25 +695,25 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -726,10 +726,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" : : "m"(kShufAc), // %0 "m"(kShufAc3), // %1 @@ -739,44 +739,44 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 
\n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqu 0x00(%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -791,25 +791,25 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, int src_width) { asm volatile( - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm5,%%xmm5 \n" // 16 pixel loop. 
LABELALIGN "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -824,22 +824,22 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr, int src_width) { asm volatile( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm3 \n" - "lea 0x20(%0),%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw (%1),%%ymm2,%%ymm0 \n" - "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -866,69 +866,69 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, int dx) { intptr_t x0, x1, temp_pixel; asm volatile( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" LABELALIGN "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movzwl 0x00(%1,%4,1),%k2 \n" - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. 
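
ScaleFilterCols_SSSE3, reformatted in this hunk, steps a 16.16 fixed-point x by dx and blends each pair of neighbouring source pixels with a 7-bit fraction (the psrlw $0x9). A scalar sketch of that arithmetic, under the assumption that rounding to nearest is intended; the signed pmaddubsw trick in the SIMD code can differ by one LSB:

#include <stdint.h>

// Bilinear horizontal scale, scalar form. x and dx are 16.16 fixed point.
static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr,
                                   int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source index
    int f = (x >> 9) & 0x7f;  // 7-bit fraction, cf. psrlw $0x9
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)((a * (128 - f) + b * f + 64) >> 7);
    x += dx;
  }
}
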
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + // 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2,(%0) \n" - "lea 0x2(%0),%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" LABELALIGN "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2,(%0) \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 @@ -966,16 +966,16 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, LABELALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 @@ -993,14 +993,14 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -1017,17 +1017,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -1043,21 +1043,21 @@ 
void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -1076,23 +1076,23 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x12; (void)src_stride; asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" LABELALIGN "1: \n" - "movd (%0),%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%0,%1,2),%%xmm2 \n" - "movd 0x00(%0,%4,1),%%xmm3 \n" - "lea 0x00(%0,%1,4),%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_stepx_x4), // %1 "+r"(dst_argb), // %2 @@ -1113,32 +1113,32 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - "lea 0x00(%0,%5,1),%5 \n" + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movhps 0x00(%0,%1,1),%%xmm0 \n" - "movq 0x00(%0,%1,2),%%xmm1 \n" - "movhps 0x00(%0,%4,1),%%xmm1 \n" - "lea 0x00(%0,%1,4),%0 \n" - "movq (%5),%%xmm2 \n" - "movhps 0x00(%5,%1,1),%%xmm2 \n" - "movq 0x00(%5,%1,2),%%xmm3 \n" - "movhps 0x00(%5,%4,1),%%xmm3 \n" - "lea 0x00(%5,%1,4),%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_stepx_x4), // %1 "+r"(dst_argb), // %2 @@ -1156,56 +1156,56 @@ void ScaleARGBCols_SSE2(uint8_t* 
dst_argb, int dx) { intptr_t x0, x1; asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" LABELALIGN "40: \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%3,%0,4),%%xmm1 \n" - "movd 0x00(%3,%1,4),%%xmm4 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%2) \n" - "lea 0x8(%2),%2 \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd %%xmm0,(%2) \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" "99: \n" : "=&a"(x0), // %0 "=&d"(x1), // %1 @@ -1230,16 +1230,16 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, LABELALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 @@ -1267,63 +1267,64 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int dx) { intptr_t x0, x1; asm volatile( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 ); asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - 
"movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" LABELALIGN "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movhps 0x00(%1,%4,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" LABELALIGN "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%0) \n" - - LABELALIGN "99: \n" // clang-format error. + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" + + LABELALIGN + "99: \n" // clang-format error. 
: "+r"(dst_argb), // %0 "+r"(src_argb), // %1 @@ -1339,10 +1340,10 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int FixedDiv_X86(int num, int div) { asm volatile( "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" : "+a"(num) // %0 : "c"(div) // %1 : "memory", "cc", "edx"); @@ -1353,13 +1354,13 @@ int FixedDiv_X86(int num, int div) { int FixedDiv1_X86(int num, int div) { asm volatile( "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" : "+a"(num) // %0 : "c"(div) // %1 : "memory", "cc", "edx"); @@ -1379,30 +1380,30 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5, %%xmm5 \n" // zero - "movdqa %4,%%xmm1 \n" // split shuffler - "movdqa %5,%%xmm3 \n" // merge shuffler + "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5, %%xmm5 \n" // zero + "movdqa %4,%%xmm1 \n" // split shuffler + "movdqa %5,%%xmm3 \n" // merge shuffler LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" // 8 UV row 0 - "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 - "lea 0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv - "pshufb %%xmm1,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add - "pmaddubsw %%xmm4,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" // vertical add - "psrlw $0x1,%%xmm0 \n" // round - "pavgw %%xmm5,%%xmm0 \n" - "pshufb %%xmm3,%%xmm0 \n" // merge uv - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" // 4 UV - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" // 8 UV row 0 + "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 + "lea 0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv + "pshufb %%xmm1,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add + "pmaddubsw %%xmm4,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" // vertical add + "psrlw $0x1,%%xmm0 \n" // round + "pavgw %%xmm5,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" // merge uv + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" // 4 UV + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1419,31 +1420,31 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero "vbroadcastf128 %4,%%ymm1 \n" // split shuffler "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 - "lea 0x20(%0),%0 \n" - "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv - "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb 
%%ymm3,%%ymm0,%%ymm0 \n" // merge uv - "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" // 8 UV - "sub $0x8,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 + "lea 0x20(%0),%0 \n" + "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv + "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv + "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" // 8 UV + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 diff --git a/chromium/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libyuv/source/scale_neon.cc index b626fc2987f..572b4bfa9b3 100644 --- a/chromium/third_party/libyuv/source/scale_neon.cc +++ b/chromium/third_party/libyuv/source/scale_neon.cc @@ -31,10 +31,10 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load even pixels into q0, odd into q1 - "vld2.8 {q0, q1}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -51,11 +51,11 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -71,21 +71,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %0 \n" + "add %1, %0 \n" "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + // row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and // pack - "vrshrn.u16 d1, q1, #2 \n" - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -102,10 +102,10 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -122,20 +122,20 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr3 = src_ptr + src_stride * 3; asm volatile( "1: \n" - "vld1.8 {q0}, [%0]! \n" // load up 16x4 - "vld1.8 {q1}, [%3]! \n" - "vld1.8 {q2}, [%4]! \n" - "vld1.8 {q3}, [%5]! \n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" - "vst1.32 {d0[0]}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [%3]! \n" + "vld1.8 {q2}, [%4]! \n" + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -156,11 +156,11 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -173,49 +173,49 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + "vrhadd.u8 d1, d1, d2 \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" - "vst3.8 {d0, d1, d2}, [%1]! \n" + "vst3.8 {d0, d1, d2}, [%1]! 
\n" - "bgt 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -230,31 +230,31 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + "vrhadd.u8 d1, d1, d2 \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -282,15 +282,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "vld1.8 {q3}, [%3] \n" + "vld1.8 {q3}, [%3] \n" "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.8 {d4}, [%1]! \n" - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -306,57 +306,57 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride * 2; asm volatile( - "vld1.16 {q13}, [%5] \n" - "vld1.8 {q14}, [%6] \n" - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" + "vld1.16 {q13}, [%5] \n" + "vld1.8 {q14}, [%6] \n" + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 // d0 = 00 10 01 11 02 12 03 13 // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" // d2 = 20 30 21 31 22 32 23 33 // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" // d0 = 00+10 01+11 02+12 03+13 // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // + s[6 + st * 1] + s[7 + st * 1] // + s[6 + st * 2] + s[7 + st * 2]) / 6 "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" + "vmovn.u16 d4, q2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -364,24 +364,24 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, // registers are already expanded. Then do transposes // to get aligned. // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" // d4 = xx 20 xx 30 xx 22 xx 32 // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" + "vtrn.u32 d2, d3 \n" // d4 = xx 20 xx 21 xx 22 xx 23 // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + "vtrn.u16 d2, d3 \n" // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + "vadd.u16 q0, q1 \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n @@ -390,14 +390,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, // Align for table lookup, vtbl requires registers to // be adjacent - "vmov.u8 d2, d4 \n" + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -416,46 +416,46 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 // d0 = 00 10 01 11 02 12 03 13 // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" // d2 = 20 30 21 31 22 32 23 33 // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" // d0 = 00+10 01+11 02+12 03+13 // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" + "vqrshrn.u16 d4, q2, #2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -463,22 +463,22 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // registers are already expanded. Then do transposes // to get aligned. // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" // combine source lines - "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q3 \n" // d4 = xx 20 xx 30 xx 22 xx 32 // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" + "vtrn.u32 d2, d3 \n" // d4 = xx 20 xx 21 xx 22 xx 23 // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + "vtrn.u16 d2, d3 \n" // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + "vadd.u16 q0, q1 \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n @@ -487,14 +487,14 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // Align for table lookup, vtbl requires registers to // be adjacent - "vmov.u8 d2, d4 \n" + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -511,13 +511,13 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, int src_width) { asm volatile( "1: \n" - "vld1.16 {q1, q2}, [%1] \n" // load accumulator - "vld1.8 {q0}, [%0]! \n" // load 16 bytes - "vaddw.u8 q2, q2, d1 \n" // add - "vaddw.u8 q1, q1, d0 \n" - "vst1.16 {q1, q2}, [%1]! \n" // store accumulator - "subs %2, %2, #16 \n" // 16 processed per loop - "bgt 1b \n" + "vld1.16 {q1, q2}, [%1] \n" // load accumulator + "vld1.8 {q0}, [%0]! \n" // load 16 bytes + "vaddw.u8 q2, q2, d1 \n" // add + "vaddw.u8 q1, q1, d0 \n" + "vst1.16 {q1, q2}, [%1]! 
\n" // store accumulator + "subs %2, %2, #16 \n" // 16 processed per loop + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -547,17 +547,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, int* tmp = dx_offset; const uint8_t* src_tmp = src_ptr; asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q3, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q3, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q1, q1, q0 \n" // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "vadd.s32 q2, q1, q3 \n" - "vshl.i32 q0, q3, #1 \n" // 8 * dx - "1: \n" + "vadd.s32 q2, q1, q3 \n" + "vshl.i32 q0, q3, #1 \n" // 8 * dx + "1: \n" LOAD2_DATA8_LANE(0) LOAD2_DATA8_LANE(1) LOAD2_DATA8_LANE(2) @@ -566,27 +566,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, LOAD2_DATA8_LANE(5) LOAD2_DATA8_LANE(6) LOAD2_DATA8_LANE(7) - "vmov q10, q1 \n" - "vmov q11, q2 \n" - "vuzp.16 q10, q11 \n" - "vmovl.u8 q8, d6 \n" - "vmovl.u8 q9, d7 \n" - "vsubl.s16 q11, d18, d16 \n" - "vsubl.s16 q12, d19, d17 \n" - "vmovl.u16 q13, d20 \n" - "vmovl.u16 q10, d21 \n" - "vmul.s32 q11, q11, q13 \n" - "vmul.s32 q12, q12, q10 \n" - "vrshrn.s32 d18, q11, #16 \n" - "vrshrn.s32 d19, q12, #16 \n" - "vadd.s16 q8, q8, q9 \n" - "vmovn.s16 d6, q8 \n" - - "vst1.8 {d6}, [%0]! \n" // store pixels - "vadd.s32 q1, q1, q0 \n" - "vadd.s32 q2, q2, q0 \n" - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" + "vmov q10, q1 \n" + "vmov q11, q2 \n" + "vuzp.16 q10, q11 \n" + "vmovl.u8 q8, d6 \n" + "vmovl.u8 q9, d7 \n" + "vsubl.s16 q11, d18, d16 \n" + "vsubl.s16 q12, d19, d17 \n" + "vmovl.u16 q13, d20 \n" + "vmovl.u16 q10, d21 \n" + "vmul.s32 q11, q11, q13 \n" + "vmul.s32 q12, q12, q10 \n" + "vrshrn.s32 d18, q11, #16 \n" + "vrshrn.s32 d19, q12, #16 \n" + "vadd.s16 q8, q8, q9 \n" + "vmovn.s16 d6, q8 \n" + + "vst1.8 {d6}, [%0]! \n" // store pixels + "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q2, q2, q0 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width), // %2 @@ -609,75 +609,75 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { asm volatile( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #64 \n" - "beq 75f \n" - "cmp %4, #128 \n" - "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" // Blend 25 / 75. "25: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! 
\n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" // Blend 75 / 25. "75: \n" - "vld1.8 {q1}, [%1]! \n" - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" "99: \n" - "vst1.8 {d1[7]}, [%0] \n" + "vst1.8 {d1[7]}, [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 @@ -694,12 +694,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vmov q2, q1 \n" // load next 8 ARGB - "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels - "bgt 1b \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vmov q2, q1 \n" // load next 8 ARGB + "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -722,13 +722,13 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, (void)src_stride; asm volatile( "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vrhadd.u8 q1, q2, q3 \n" // rounding half add - "vst2.32 {q0, q1}, [%1]! \n" - "bgt 1b \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vrhadd.u8 q1, q2, q3 \n" // rounding half add + "vst2.32 {q0, q1}, [%1]! \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -743,27 +743,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vld4.8 {d16, d18, d20, d22}, [%1]! 
\n" // load 8 more ARGB - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -781,15 +781,15 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, int dst_width) { (void)src_stride; asm volatile( - "mov r12, %3, lsl #2 \n" + "mov r12, %3, lsl #2 \n" "1: \n" - "vld1.32 {d0[0]}, [%0], r12 \n" - "vld1.32 {d0[1]}, [%0], r12 \n" - "vld1.32 {d1[0]}, [%0], r12 \n" - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -805,30 +805,30 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { asm volatile( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" "1: \n" - "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 - "vld1.8 {d1}, [%1], r12 \n" - "vld1.8 {d2}, [%0], r12 \n" - "vld1.8 {d3}, [%1], r12 \n" - "vld1.8 {d4}, [%0], r12 \n" - "vld1.8 {d5}, [%1], r12 \n" - "vld1.8 {d6}, [%0], r12 \n" - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%2]! 
\n" - "bgt 1b \n" + "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride), // %1 "+r"(dst_argb), // %2 @@ -865,8 +865,8 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, LOAD1_DATA32_LANE(d3, 1) // clang-format on "vst1.32 {q0, q1}, [%0]! \n" // store pixels - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 @@ -897,16 +897,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, int* tmp = dx_offset; const uint8_t* src_tmp = src_argb; asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q9, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" - "vmov.i8 q3, #0x7f \n" // 0x7F - "vmov.i16 q15, #0x7f \n" // 0x7F + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q9, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + "vmov.i8 q3, #0x7f \n" // 0x7F + "vmov.i16 q15, #0x7f \n" // 0x7F // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q8, q1, q0 \n" - "1: \n" + "vadd.s32 q8, q1, q0 \n" + "1: \n" // d0, d1: a // d2, d3: b LOAD2_DATA32_LANE(d0, d2, 0) @@ -951,26 +951,26 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, #undef LOAD2_DATA32_LANE void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. - "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts. - "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV - "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV - "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes - "vrshrn.u16 d1, q1, #2 \n" - "vst2.8 {d0, d1}, [%2]! \n" - "bgt 1b \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts. + "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV + "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV + "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vst2.8 {d0, d1}, [%2]! 
\n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -979,6 +979,35 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, : "memory", "cc", "q0", "q1", "q8", "q9"); } +// Reads 4 pixels at a time. +void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, // pixel step + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src1_ptr = src_ptr + src_stepx * 2; + const uint8_t* src2_ptr = src_ptr + src_stepx * 4; + const uint8_t* src3_ptr = src_ptr + src_stepx * 6; + (void)src_stride; + asm volatile( + "1: \n" + "vld1.16 {d0[0]}, [%0], %6 \n" + "vld1.16 {d0[1]}, [%1], %6 \n" + "vld1.16 {d0[2]}, [%2], %6 \n" + "vld1.16 {d0[3]}, [%3], %6 \n" + "subs %5, %5, #4 \n" // 4 pixels per loop. + "vst1.8 {d0}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src1_ptr), // %1 + "+r"(src2_ptr), // %2 + "+r"(src3_ptr), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_width) // %5 + : "r"(src_stepx * 8) // %6 + : "memory", "cc", "d0"); +} + #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/scale_neon64.cc b/chromium/third_party/libyuv/source/scale_neon64.cc index c45b7abecea..185591cb55b 100644 --- a/chromium/third_party/libyuv/source/scale_neon64.cc +++ b/chromium/third_party/libyuv/source/scale_neon64.cc @@ -29,11 +29,11 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -51,12 +51,12 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -72,21 +72,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddlp v1.8h, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post 
inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddlp v1.8h, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -103,11 +103,11 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #8 \n" // 8 processed per loop - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v2.8b}, [%1], #8 \n" - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -124,23 +124,23 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr3 = src_ptr + src_stride * 3; asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 - "ld1 {v1.16b}, [%2], #16 \n" - "ld1 {v2.16b}, [%3], #16 \n" - "ld1 {v3.16b}, [%4], #16 \n" - "subs %w5, %w5, #4 \n" - "uaddlp v0.8h, v0.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v0.8h, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "uadalp v0.8h, v2.16b \n" - "prfm pldl1keep, [%3, 448] \n" - "uadalp v0.8h, v3.16b \n" - "prfm pldl1keep, [%4, 448] \n" - "addp v0.8h, v0.8h, v0.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding - "st1 {v0.s}[0], [%1], #4 \n" - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v0.8h, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "uadalp v0.8h, v2.16b \n" + "prfm pldl1keep, [%3, 448] \n" + "uadalp v0.8h, v3.16b \n" + "prfm pldl1keep, [%4, 448] \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_ptr1), // %2 @@ -160,13 +160,13 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -179,51 +179,51 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: 
\n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v20.8b \n" - "umlal v17.8h, v1.8b, v20.8b \n" - "umlal v18.8h, v2.8b, v20.8b \n" - "umlal v19.8h, v3.8b, v20.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // (3 * line_0 + line_1) >> 2 - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" - "prfm pldl1keep, [%3, 448] \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + "prfm pldl1keep, [%3, 448] \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v16.8h, v1.8b, #0 \n" - "umlal v16.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" + "urhadd v1.8b, v1.8b, v2.8b \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "umlal v16.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -238,35 +238,35 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" // average src line 0 with src line 1 - "urhadd v0.8b, v0.8b, v4.8b \n" - "urhadd v1.8b, v1.8b, v5.8b \n" - "urhadd v2.8b, v2.8b, v6.8b \n" - "urhadd v3.8b, v3.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" - "prfm pldl1keep, [%3, 448] \n" + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + "prfm pldl1keep, [%3, 448] \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" + "urhadd v1.8b, v1.8b, v2.8b \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" + 
"ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -293,15 +293,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "ld1 {v3.16b}, [%3] \n" - "1: \n" - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v2.8b}, [%1], #8 \n" - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -318,68 +318,68 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t tmp_src_stride = src_stride; asm volatile( - "ld1 {v29.8h}, [%5] \n" - "ld1 {v30.16b}, [%6] \n" - "ld1 {v31.8h}, [%7] \n" - "add %2, %2, %0 \n" - "1: \n" + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" // 00 40 01 41 02 42 03 43 // 10 50 11 51 12 52 13 53 // 20 60 21 61 22 62 23 63 // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 // 00 10 01 11 02 12 03 13 // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" // 20 30 21 31 22 32 23 33 // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" // 00+10 01+11 02+12 03+13 // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // + s[6 + st * 1] + s[7 + st * 1] // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -387,38 +387,38 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, // registers are already expanded. Then do transposes // to get aligned. // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" // combine source lines - "add v0.8h, v0.8h, v16.8h \n" + "add v0.8h, v0.8h, v16.8h \n" // xx 20 xx 21 xx 22 xx 23 // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" - "prfm pldl1keep, [%2, 448] \n" + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. 
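// A worked example of the trick above, assuming the multiplier vectors
// hold 65536 / (2 * n) (sqrdmulh rounds and doubles the product:
// result = (2 * a * b + 0x8000) >> 16), e.g. for a 3x3 box, n = 9:
//   sum = 90: (2 * 90 * (65536 / 18) + 0x8000) >> 16 = 10 = 90 / 9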
- "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" - "prfm pldl1keep, [%3, 448] \n" + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + "prfm pldl1keep, [%3, 448] \n" // Align for table lookup, vtbl requires registers to be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(tmp_src_stride), // %2 @@ -440,53 +440,53 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // TODO(fbarchard): use src_stride directly for clang 3.5+. ptrdiff_t tmp_src_stride = src_stride; asm volatile( - "ld1 {v30.8h}, [%4] \n" - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" // 00 40 01 41 02 42 03 43 // 10 50 11 51 12 52 13 53 // 20 60 21 61 22 62 23 63 // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // 00 10 01 11 02 12 03 13 // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" // 20 30 21 31 22 32 23 33 // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" // 00+10 01+11 02+12 03+13 // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" + "uqrshrn v2.8b, v2.8h, #2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. 
This @@ -496,35 +496,35 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" + "uaddl v0.8h, v0.8b, v4.8b \n" // xx 20 xx 21 xx 22 xx 23 // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" - "prfm pldl1keep, [%2, 448] \n" + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. - "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" // Align for table lookup, vtbl requires registers to // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(tmp_src_stride), // %2 @@ -542,14 +542,14 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, int src_width) { asm volatile( "1: \n" - "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator - "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes - "uaddw2 v2.8h, v2.8h, v0.16b \n" // add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddw v1.8h, v1.8h, v0.8b \n" - "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator - "subs %w2, %w2, #16 \n" // 16 processed per loop - "b.gt 1b \n" + "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator + "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes + "uaddw2 v2.8h, v2.8h, v0.16b \n" // add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddw v1.8h, v1.8h, v0.8b \n" + "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator + "subs %w2, %w2, #16 \n" // 16 processed per loop + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -581,17 +581,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v3.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v3.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v1.4s, v1.4s, v0.4s \n" + "add v1.4s, v1.4s, v0.4s \n" // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "add v2.4s, v1.4s, v3.4s \n" - "shl v0.4s, v3.4s, #1 \n" // 8 * dx - "1: \n" + "add v2.4s, v1.4s, v3.4s \n" + "shl v0.4s, v3.4s, #1 \n" // 8 * dx + "1: \n" LOAD2_DATA8_LANE(0) LOAD2_DATA8_LANE(1) LOAD2_DATA8_LANE(2) @@ -600,27 +600,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, LOAD2_DATA8_LANE(5) LOAD2_DATA8_LANE(6) LOAD2_DATA8_LANE(7) - "mov v6.16b, v1.16b \n" - "mov v7.16b, v2.16b \n" - "uzp1 v6.8h, v6.8h, v7.8h \n" - "ushll v4.8h, v4.8b, #0 \n" - "ushll v5.8h, v5.8b, #0 \n" - "ssubl v16.4s, v5.4h, v4.4h 
\n" - "ssubl2 v17.4s, v5.8h, v4.8h \n" - "ushll v7.4s, v6.4h, #0 \n" - "ushll2 v6.4s, v6.8h, #0 \n" - "mul v16.4s, v16.4s, v7.4s \n" - "mul v17.4s, v17.4s, v6.4s \n" - "rshrn v6.4h, v16.4s, #16 \n" - "rshrn2 v6.8h, v17.4s, #16 \n" - "add v4.8h, v4.8h, v6.8h \n" - "xtn v4.8b, v4.8h \n" - - "st1 {v4.8b}, [%0], #8 \n" // store pixels - "add v1.4s, v1.4s, v0.4s \n" - "add v2.4s, v2.4s, v0.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" + "mov v6.16b, v1.16b \n" + "mov v7.16b, v2.16b \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "ushll v4.8h, v4.8b, #0 \n" + "ushll v5.8h, v5.8b, #0 \n" + "ssubl v16.4s, v5.4h, v4.4h \n" + "ssubl2 v17.4s, v5.8h, v4.8h \n" + "ushll v7.4s, v6.4h, #0 \n" + "ushll2 v6.4s, v6.8h, #0 \n" + "mul v16.4s, v16.4s, v7.4s \n" + "mul v17.4s, v17.4s, v6.4s \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" + "add v4.8h, v4.8h, v6.8h \n" + "xtn v4.8b, v4.8h \n" + + "st1 {v4.8b}, [%0], #8 \n" // store pixels + "add v1.4s, v1.4s, v0.4s \n" + "add v2.4s, v2.4s, v0.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width), // %2 @@ -644,83 +644,83 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr, int source_y_fraction) { int y_fraction = 256 - source_y_fraction; asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" - - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" // General purpose row blend. "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" // Blend 25 / 75. "25: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" // Blend 50 / 50. 
"50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" // Blend 75 / 25. "75: \n" - "ld1 {v1.16b}, [%1], #16 \n" - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" "99: \n" - "st1 {v0.b}[15], [%0] \n" + "st1 {v0.b}[15], [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 @@ -739,12 +739,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 - "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "mov v2.16b, v3.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels - "b.gt 1b \n" + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "mov v2.16b, v3.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -761,14 +761,14 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, asm volatile( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 - "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "urhadd v1.16b, v2.16b, v3.16b \n" - "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "urhadd v1.16b, v2.16b, v3.16b \n" + "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -783,27 +783,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "ld4 
{v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 - "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. - "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn v1.8b, v1.8h, #2 \n" - "rshrn v2.8b, v2.8h, #2 \n" - "rshrn v3.8b, v3.8h, #2 \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -822,14 +822,14 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, (void)src_stride; asm volatile( "1: \n" - "ld1 {v0.s}[0], [%0], %3 \n" - "ld1 {v0.s}[1], [%0], %3 \n" - "ld1 {v0.s}[2], [%0], %3 \n" - "ld1 {v0.s}[3], [%0], %3 \n" - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. 
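// Each "ld1 {v0.s}[k], [%0], %3" above loads one 4-byte ARGB pixel
// into lane k and post-increments the source pointer by %3, which
// holds src_stepx * 4 bytes, so a single 16-byte store writes four
// gathered pixels.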
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -847,35 +847,35 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { asm volatile( - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 - "ld1 {v1.8b}, [%1], %4 \n" - "ld1 {v2.8b}, [%0], %4 \n" - "ld1 {v3.8b}, [%1], %4 \n" - "ld1 {v4.8b}, [%0], %4 \n" - "ld1 {v5.8b}, [%1], %4 \n" - "ld1 {v6.8b}, [%0], %4 \n" - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd - "mov v0.d[1], v2.d[0] \n" - "mov v2.d[0], v16.d[1] \n" - "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh - "mov v4.d[1], v6.d[0] \n" - "mov v6.d[0], v16.d[1] \n" - "prfm pldl1keep, [%1, 448] \n" - "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "prfm pldl1keep, [%1, 448] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
+ "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride), // %1 "+r"(dst_argb), // %2 @@ -912,11 +912,11 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, LOAD1_DATA32_LANE(v1, 1) LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 3) - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead // clang-format on - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 @@ -949,16 +949,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v6.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" - "movi v3.16b, #0x7f \n" // 0x7F - "movi v4.8h, #0x7f \n" // 0x7F + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v6.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + "movi v3.16b, #0x7f \n" // 0x7F + "movi v4.8h, #0x7f \n" // 0x7F // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v5.4s, v1.4s, v0.4s \n" - "1: \n" + "add v5.4s, v1.4s, v0.4s \n" + "1: \n" // d0, d1: a // d2, d3: b LOAD2_DATA32_LANE(v0, v1, 0) @@ -1010,21 +1010,21 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "1: \n" - "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #8 \n" // 8 processed per loop - "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent - "uaddlp v1.4s, v1.8h \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent - "uadalp v1.4s, v3.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "rshrn v0.4h, v0.4s, #2 \n" // round and pack - "rshrn2 v0.8h, v1.4s, #2 \n" - "st1 {v0.8h}, [%2], #16 \n" - "b.gt 1b \n" + "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #8 \n" // 8 processed per loop + "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent + "uaddlp v1.4s, v1.8h \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent + "uadalp v1.4s, v3.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "rshrn v0.4h, v0.4s, #2 \n" // round and pack + "rshrn2 v0.8h, v1.4s, #2 \n" + "st1 {v0.8h}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -1041,40 +1041,40 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, uint16_t* dst, int dst_width) { asm volatile( - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 - "movi v0.8h, #9 \n" // constants - "movi v1.4s, #3 \n" + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "movi v0.8h, #9 \n" // constants + "movi v1.4s, #3 \n" "1: \n" - "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 - "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 - "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row - "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 - "subs %w3, %w3, #16 \n" // 16 dst pixels per loop - "umull v16.4s, v3.4h, v0.4h \n" - "umull2 v7.4s, v3.8h, 
v0.8h \n" - "umull v18.4s, v4.4h, v0.4h \n" - "umull2 v17.4s, v4.8h, v0.8h \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddw v16.4s, v16.4s, v6.4h \n" - "uaddl2 v19.4s, v6.8h, v3.8h \n" - "uaddl v3.4s, v6.4h, v3.4h \n" - "uaddw2 v6.4s, v7.4s, v6.8h \n" - "uaddl2 v7.4s, v5.8h, v4.8h \n" - "uaddl v4.4s, v5.4h, v4.4h \n" - "uaddw v18.4s, v18.4s, v5.4h \n" - "prfm pldl1keep, [%1, 448] \n" - "mla v16.4s, v4.4s, v1.4s \n" - "mla v18.4s, v3.4s, v1.4s \n" - "mla v6.4s, v7.4s, v1.4s \n" - "uaddw2 v4.4s, v17.4s, v5.8h \n" - "uqrshrn v16.4h, v16.4s, #4 \n" - "mla v4.4s, v19.4s, v1.4s \n" - "uqrshrn2 v16.8h, v6.4s, #4 \n" - "uqrshrn v17.4h, v18.4s, #4 \n" - "uqrshrn2 v17.8h, v4.4s, #4 \n" - "st2 {v16.8h-v17.8h}, [%2], #32 \n" - "b.gt 1b \n" + "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 + "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 + "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row + "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 + "subs %w3, %w3, #16 \n" // 16 dst pixels per loop + "umull v16.4s, v3.4h, v0.4h \n" + "umull2 v7.4s, v3.8h, v0.8h \n" + "umull v18.4s, v4.4h, v0.4h \n" + "umull2 v17.4s, v4.8h, v0.8h \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddw v16.4s, v16.4s, v6.4h \n" + "uaddl2 v19.4s, v6.8h, v3.8h \n" + "uaddl v3.4s, v6.4h, v3.4h \n" + "uaddw2 v6.4s, v7.4s, v6.8h \n" + "uaddl2 v7.4s, v5.8h, v4.8h \n" + "uaddl v4.4s, v5.4h, v4.4h \n" + "uaddw v18.4s, v18.4s, v5.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "mla v16.4s, v4.4s, v1.4s \n" + "mla v18.4s, v3.4s, v1.4s \n" + "mla v6.4s, v7.4s, v1.4s \n" + "uaddw2 v4.4s, v17.4s, v5.8h \n" + "uqrshrn v16.4h, v16.4s, #4 \n" + "mla v4.4s, v19.4s, v1.4s \n" + "uqrshrn2 v16.8h, v6.4s, #4 \n" + "uqrshrn v17.4h, v18.4s, #4 \n" + "uqrshrn2 v17.8h, v4.4s, #4 \n" + "st2 {v16.8h-v17.8h}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -1092,21 +1092,21 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts. - "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 - "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "prfm pldl1keep, [%1, 448] \n" - "rshrn v1.8b, v1.8h, #2 \n" - "st2 {v0.8b,v1.8b}, [%2], #16 \n" - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts. + "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 + "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "prfm pldl1keep, [%1, 448] \n" + "rshrn v1.8b, v1.8h, #2 \n" + "st2 {v0.8b,v1.8b}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -1115,6 +1115,35 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, : "memory", "cc", "v0", "v1", "v16", "v17"); } +// Reads 4 pixels at a time. 
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, // pixel step + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src1_ptr = src_ptr + src_stepx * 2; + const uint8_t* src2_ptr = src_ptr + src_stepx * 4; + const uint8_t* src3_ptr = src_ptr + src_stepx * 6; + (void)src_stride; + asm volatile( + "1: \n" + "ld1 {v0.h}[0], [%0], %6 \n" + "ld1 {v1.h}[0], [%1], %6 \n" + "ld1 {v2.h}[0], [%2], %6 \n" + "ld1 {v3.h}[0], [%3], %6 \n" + "subs %w5, %w5, #4 \n" // 4 pixels per loop. + "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src1_ptr), // %1 + "+r"(src2_ptr), // %2 + "+r"(src3_ptr), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_width) // %5 + : "r"((int64_t)(src_stepx * 8)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/scale_uv.cc b/chromium/third_party/libyuv/source/scale_uv.cc index 4e276518aaf..c57df5959b9 100644 --- a/chromium/third_party/libyuv/source/scale_uv.cc +++ b/chromium/third_party/libyuv/source/scale_uv.cc @@ -299,6 +299,14 @@ static void ScaleUVDownEven(int src_width, } #endif #if defined(HAS_SCALEUVROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && !filtering) { + ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = ScaleUVRowDownEven_NEON; + } + } +#endif// TODO(fbarchard): Enable Box filter +#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON : ScaleUVRowDownEven_Any_NEON; @@ -484,7 +492,7 @@ static void ScaleUVBilinearUp(int src_width, #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { + if (IS_ALIGNED(dst_width, 8)) { InterpolateRow = InterpolateRow_SSSE3; } } @@ -492,7 +500,7 @@ static void ScaleUVBilinearUp(int src_width, #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 8)) { + if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_AVX2; } } @@ -500,7 +508,7 @@ static void ScaleUVBilinearUp(int src_width, #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { + if (IS_ALIGNED(dst_width, 8)) { InterpolateRow = InterpolateRow_NEON; } } @@ -508,7 +516,7 @@ static void ScaleUVBilinearUp(int src_width, #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(dst_width, 2)) { + if (IS_ALIGNED(dst_width, 4)) { InterpolateRow = InterpolateRow_MMI; } } @@ -516,7 +524,7 @@ static void ScaleUVBilinearUp(int src_width, #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { + if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_MSA; } } @@ -532,7 +540,7 @@ static void ScaleUVBilinearUp(int src_width, #if defined(HAS_SCALEUVFILTERCOLS_NEON) if (filtering && TestCpuFlag(kCpuHasNEON)) { ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { + if (IS_ALIGNED(dst_width, 8)) { ScaleUVFilterCols = ScaleUVFilterCols_NEON; } } @@ -540,7 +548,7 @@ static void ScaleUVBilinearUp(int src_width, #if defined(HAS_SCALEUVFILTERCOLS_MSA) if (filtering && 
TestCpuFlag(kCpuHasMSA)) { ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { + if (IS_ALIGNED(dst_width, 16)) { ScaleUVFilterCols = ScaleUVFilterCols_MSA; } } @@ -553,7 +561,7 @@ static void ScaleUVBilinearUp(int src_width, #if defined(HAS_SCALEUVCOLS_NEON) if (!filtering && TestCpuFlag(kCpuHasNEON)) { ScaleUVFilterCols = ScaleUVCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { + if (IS_ALIGNED(dst_width, 16)) { ScaleUVFilterCols = ScaleUVCols_NEON; } } @@ -569,7 +577,7 @@ static void ScaleUVBilinearUp(int src_width, #if defined(HAS_SCALEUVCOLS_MSA) if (!filtering && TestCpuFlag(kCpuHasMSA)) { ScaleUVFilterCols = ScaleUVCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { + if (IS_ALIGNED(dst_width, 8)) { ScaleUVFilterCols = ScaleUVCols_MSA; } } @@ -836,7 +844,6 @@ static void ScaleUV(const uint8_t* src, dst_stride, src, dst, x, y, dy, 4, filtering); return; } - #if HAS_SCALEUVBILINEARUP if (filtering && dy < 65536) { ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height, diff --git a/chromium/third_party/libyuv/unit_test/planar_test.cc b/chromium/third_party/libyuv/unit_test/planar_test.cc index 65aa46e0dae..e05ff15640c 100644 --- a/chromium/third_party/libyuv/unit_test/planar_test.cc +++ b/chromium/third_party/libyuv/unit_test/planar_test.cc @@ -3562,4 +3562,68 @@ TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_uv_c); } +TEST_F(LibYUVPlanarTest, NV12Copy) { + const int halfwidth = (benchmark_width_ + 1) >> 1; + const int halfheight = (benchmark_height_ + 1) >> 1; + align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_); + align_buffer_page_end(src_uv, halfwidth * 2 * halfheight); + align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_); + align_buffer_page_end(dst_uv, halfwidth * 2 * halfheight); + + MemRandomize(src_y, benchmark_width_ * benchmark_height_); + MemRandomize(src_uv, halfwidth * 2 * halfheight); + MemRandomize(dst_y, benchmark_width_ * benchmark_height_); + MemRandomize(dst_uv, halfwidth * 2 * halfheight); + + for (int i = 0; i < benchmark_iterations_; ++i) { + NV12Copy(src_y, benchmark_width_, src_uv, halfwidth * 2, dst_y, + benchmark_width_, dst_uv, halfwidth * 2, benchmark_width_, + benchmark_height_); + } + + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + EXPECT_EQ(src_y[i], dst_y[i]); + } + for (int i = 0; i < halfwidth * 2 * halfheight; ++i) { + EXPECT_EQ(src_uv[i], dst_uv[i]); + } + + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_uv); + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_uv); +} + +TEST_F(LibYUVPlanarTest, NV21Copy) { + const int halfwidth = (benchmark_width_ + 1) >> 1; + const int halfheight = (benchmark_height_ + 1) >> 1; + align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_); + align_buffer_page_end(src_vu, halfwidth * 2 * halfheight); + align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_); + align_buffer_page_end(dst_vu, halfwidth * 2 * halfheight); + + MemRandomize(src_y, benchmark_width_ * benchmark_height_); + MemRandomize(src_vu, halfwidth * 2 * halfheight); + MemRandomize(dst_y, benchmark_width_ * benchmark_height_); + MemRandomize(dst_vu, halfwidth * 2 * halfheight); + + for (int i = 0; i < benchmark_iterations_; ++i) { + NV21Copy(src_y, benchmark_width_, src_vu, halfwidth * 2, dst_y, + benchmark_width_, dst_vu, halfwidth * 2, benchmark_width_, + benchmark_height_); + } + + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + 
EXPECT_EQ(src_y[i], dst_y[i]); + } + for (int i = 0; i < halfwidth * 2 * halfheight; ++i) { + EXPECT_EQ(src_vu[i], dst_vu[i]); + } + + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_vu); + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_vu); +} + } // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc index 2fdf5f60341..c04a236a1a4 100644 --- a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc +++ b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc @@ -312,6 +312,21 @@ TEST_SCALETO(ARGBScale, 1920, 1080) #undef TEST_SCALETO1 #undef TEST_SCALETO +#define TEST_SCALESWAPXY1(name, filter, max_diff) \ + TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \ + int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } + +// Test scale with swapped width and height with all 3 filters. +TEST_SCALESWAPXY1(ARGBScale, None, 0) +TEST_SCALESWAPXY1(ARGBScale, Linear, 0) +TEST_SCALESWAPXY1(ARGBScale, Bilinear, 0) +#undef TEST_SCALESWAPXY1 + // Scale with YUV conversion to ARGB and clipping. // TODO(fbarchard): Add fourcc support. All 4 ARGB formats is easy to support. LIBYUV_API diff --git a/chromium/third_party/libyuv/unit_test/scale_test.cc b/chromium/third_party/libyuv/unit_test/scale_test.cc index aa36202e825..d5294110be1 100644 --- a/chromium/third_party/libyuv/unit_test/scale_test.cc +++ b/chromium/third_party/libyuv/unit_test/scale_test.cc @@ -771,6 +771,58 @@ TEST_SCALETO(Scale, 1920, 1080) #undef TEST_SCALETO1 #undef TEST_SCALETO +#define TEST_SCALESWAPXY1(DISABLED_, name, filter, max_diff) \ + TEST_F(LibYUVScaleTest, I420##name##SwapXY_##filter) { \ + int diff = I420TestFilter(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, I444##name##SwapXY_##filter) { \ + int diff = I444TestFilter(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_16) { \ + int diff = I420TestFilter_16(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_16) { \ + int diff = I444TestFilter_16(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, NV12##name##SwapXY_##filter) { \ + int diff = NV12TestFilter(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } + +// Test scale to a specified size with all 4 filters. 
+#ifdef ENABLE_SLOW_TESTS +TEST_SCALESWAPXY1(, Scale, None, 0) +TEST_SCALESWAPXY1(, Scale, Linear, 3) +TEST_SCALESWAPXY1(, Scale, Bilinear, 3) +TEST_SCALESWAPXY1(, Scale, Box, 3) +#else +TEST_SCALESWAPXY1(DISABLED_, Scale, None, 0) +TEST_SCALESWAPXY1(DISABLED_, Scale, Linear, 3) +TEST_SCALESWAPXY1(DISABLED_, Scale, Bilinear, 3) +TEST_SCALESWAPXY1(DISABLED_, Scale, Box, 3) +#endif + +#undef TEST_SCALESWAPXY1 + #ifdef ENABLE_ROW_TESTS #ifdef HAS_SCALEROWDOWN2_SSSE3 TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { @@ -1052,4 +1104,153 @@ TEST_FACTOR(3, 1, 3, 0) #undef TEST_FACTOR #undef SX #undef DX + +TEST_F(LibYUVScaleTest, PlaneTest3x) { + const int kSrcStride = 48; + const int kDstStride = 16; + const int kSize = kSrcStride * 3; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < 48 * 3; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_pixels, kDstStride); + + int iterations16 = + benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_; + for (int i = 0; i < iterations16; ++i) { + ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1, + kFilterBilinear); + } + + EXPECT_EQ(49, dest_pixels[0]); + + ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1, + kFilterNone); + + EXPECT_EQ(49, dest_pixels[0]); + + free_aligned_buffer_page_end(dest_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVScaleTest, PlaneTest4x) { + const int kSrcStride = 64; + const int kDstStride = 16; + const int kSize = kSrcStride * 4; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < 64 * 4; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_pixels, kDstStride); + + int iterations16 = + benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_; + for (int i = 0; i < iterations16; ++i) { + ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1, + kFilterBilinear); + } + + EXPECT_EQ((65 + 66 + 129 + 130 + 2) / 4, dest_pixels[0]); + + ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1, + kFilterNone); + + EXPECT_EQ(130, dest_pixels[0]); // expect the 3rd pixel of the 3rd row + + free_aligned_buffer_page_end(dest_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +// Intent is to test 200x50 to 50x200 but width and height can be parameters. +TEST_F(LibYUVScaleTest, PlaneTestRotate_None) { + const int kSize = benchmark_width_ * benchmark_height_; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < kSize; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_opt_pixels, kSize); + align_buffer_page_end(dest_c_pixels, kSize); + + MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, + dest_c_pixels, benchmark_height_, benchmark_height_, + benchmark_width_, kFilterNone); + MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. 
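// The pattern: run ScalePlane once with SIMD masked off to build a C
// reference image, re-enable the optimized paths, then require the
// optimized output to match the reference byte for byte.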
+ + for (int i = 0; i < benchmark_iterations_; ++i) { + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, + benchmark_height_, dest_opt_pixels, benchmark_height_, + benchmark_height_, benchmark_width_, kFilterNone); + } + + for (int i = 0; i < kSize; ++i) { + EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); + } + + free_aligned_buffer_page_end(dest_c_pixels); + free_aligned_buffer_page_end(dest_opt_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) { + const int kSize = benchmark_width_ * benchmark_height_; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < kSize; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_opt_pixels, kSize); + align_buffer_page_end(dest_c_pixels, kSize); + + MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, + dest_c_pixels, benchmark_height_, benchmark_height_, + benchmark_width_, kFilterBilinear); + MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. + + for (int i = 0; i < benchmark_iterations_; ++i) { + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, + benchmark_height_, dest_opt_pixels, benchmark_height_, + benchmark_height_, benchmark_width_, kFilterBilinear); + } + + for (int i = 0; i < kSize; ++i) { + EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); + } + + free_aligned_buffer_page_end(dest_c_pixels); + free_aligned_buffer_page_end(dest_opt_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + +// Intent is to test 200x50 to 50x200 but width and height can be parameters. +TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) { + const int kSize = benchmark_width_ * benchmark_height_; + align_buffer_page_end(orig_pixels, kSize); + for (int i = 0; i < kSize; ++i) { + orig_pixels[i] = i; + } + align_buffer_page_end(dest_opt_pixels, kSize); + align_buffer_page_end(dest_c_pixels, kSize); + + MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, + dest_c_pixels, benchmark_height_, benchmark_height_, + benchmark_width_, kFilterBox); + MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. + + for (int i = 0; i < benchmark_iterations_; ++i) { + ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, + benchmark_height_, dest_opt_pixels, benchmark_height_, + benchmark_height_, benchmark_width_, kFilterBox); + } + + for (int i = 0; i < kSize; ++i) { + EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); + } + + free_aligned_buffer_page_end(dest_c_pixels); + free_aligned_buffer_page_end(dest_opt_pixels); + free_aligned_buffer_page_end(orig_pixels); +} + } // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/scale_uv_test.cc b/chromium/third_party/libyuv/unit_test/scale_uv_test.cc index b62bf3ad779..e45a25da4a5 100644 --- a/chromium/third_party/libyuv/unit_test/scale_uv_test.cc +++ b/chromium/third_party/libyuv/unit_test/scale_uv_test.cc @@ -176,6 +176,21 @@ TEST_SCALETO(UVScale, 1920, 1080) #undef TEST_SCALETO1 #undef TEST_SCALETO +#define TEST_SCALESWAPXY1(name, filter, max_diff) \ + TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \ + int diff = \ + UVTestFilter(benchmark_width_, benchmark_height_, benchmark_height_, \ + benchmark_width_, kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } + +// Test scale with swapped width and height with all 3 filters. 
+TEST_SCALESWAPXY1(UVScale, None, 0) +TEST_SCALESWAPXY1(UVScale, Linear, 0) +TEST_SCALESWAPXY1(UVScale, Bilinear, 0) +#undef TEST_SCALESWAPXY1 + TEST_F(LibYUVScaleTest, UVTest3x) { const int kSrcStride = 48 * 2; const int kDstStride = 16 * 2; diff --git a/chromium/third_party/libyuv/winarm.mk b/chromium/third_party/libyuv/winarm.mk index c4307a431f9..b0a344ae06d 100644 --- a/chromium/third_party/libyuv/winarm.mk +++ b/chromium/third_party/libyuv/winarm.mk @@ -31,6 +31,7 @@ LOCAL_OBJ_FILES = \ source/scale_any.o\ source/scale_argb.o\ source/scale_common.o\ + source/scale_uv.o\ source/video_common.o .cc.o:
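As a reading aid for the ScaleUVRowDownEven_NEON kernels added above, here is a scalar C sketch of the same per-pixel result (illustrative only, not part of this patch; the helper name is hypothetical, and src_stride is omitted because the NEON variants also ignore it):

#include <stdint.h>

// Copy every src_stepx-th UV pixel (2 bytes each) to the destination.
// The NEON kernels gather 4 such pixels per loop iteration; the output
// is identical.
static void ScaleUVRowDownEven_Sketch(const uint8_t* src_ptr,
                                      int src_stepx,  // pixel step
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[0] = src_ptr[0];  // U
    dst_ptr[1] = src_ptr[1];  // V
    dst_ptr += 2;
    src_ptr += src_stepx * 2;  // advance src_stepx UV pixels
  }
}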