diff options
Diffstat (limited to 'chromium/third_party/libyuv/source/scale_neon64.cc')
-rw-r--r-- | chromium/third_party/libyuv/source/scale_neon64.cc | 42 |
1 files changed, 15 insertions, 27 deletions
diff --git a/chromium/third_party/libyuv/source/scale_neon64.cc b/chromium/third_party/libyuv/source/scale_neon64.cc index 494a9cfbfbe..f4aed5fc92f 100644 --- a/chromium/third_party/libyuv/source/scale_neon64.cc +++ b/chromium/third_party/libyuv/source/scale_neon64.cc @@ -515,37 +515,25 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, "v19", "v30", "v31", "memory", "cc"); } -void ScaleAddRows_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - int src_width, - int src_height) { - const uint8_t* src_tmp; +// Add a row of bytes to a row of shorts. Used for box filter. +// Reads 16 bytes and accumulates to 16 shorts at a time. +void ScaleAddRow_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { asm volatile( "1: \n" - "mov %0, %1 \n" - "mov w12, %w5 \n" - "eor v2.16b, v2.16b, v2.16b \n" - "eor v3.16b, v3.16b, v3.16b \n" - "2: \n" - // load 16 pixels into q0 - "ld1 {v0.16b}, [%0], %3 \n" - "uaddw2 v3.8h, v3.8h, v0.16b \n" - "uaddw v2.8h, v2.8h, v0.8b \n" - "subs w12, w12, #1 \n" - "b.gt 2b \n" - "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels - "add %1, %1, #16 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop + "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator + "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes + "uaddw2 v2.8h, v2.8h, v0.16b \n" // add + "uaddw v1.8h, v1.8h, v0.8b \n" + "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator + "subs %w2, %w2, #16 \n" // 16 processed per loop "b.gt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 : - : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List + : "memory", "cc", "v0", "v1", "v2" // Clobber List ); } |