diff options
Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm')
-rw-r--r-- | chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm | 1282 |
1 files changed, 0 insertions, 1282 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm deleted file mode 100644 index 1cbbbcdef5e..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm +++ /dev/null @@ -1,1282 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_loop_filter_horizontal_edge_armv6| - EXPORT |vp8_mbloop_filter_horizontal_edge_armv6| - EXPORT |vp8_loop_filter_vertical_edge_armv6| - EXPORT |vp8_mbloop_filter_vertical_edge_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code - - MACRO - TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 - ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 - ; a0: 03 02 01 00 - ; a1: 13 12 11 10 - ; a2: 23 22 21 20 - ; a3: 33 32 31 30 - ; b3 b2 b1 b0 - - uxtb16 $b1, $a1 ; xx 12 xx 10 - uxtb16 $b0, $a0 ; xx 02 xx 00 - uxtb16 $b3, $a3 ; xx 32 xx 30 - uxtb16 $b2, $a2 ; xx 22 xx 20 - orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 - orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 - - uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 - uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 - uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 - uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 - orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 - orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 - - pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 - pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 - - pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 - pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 - MEND - - -src RN r0 -pstep RN r1 -count RN r5 - -;r0 unsigned char *src_ptr, -;r1 int src_pixel_step, -;r2 const char *blimit, -;r3 const char *limit, -;stack const char *thresh, -;stack int count - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_loop_filter_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r6, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r9, [src], pstep ; p3 - ldrb r4, [r2] ; blimit - ldr r10, [src], pstep ; p2 - ldrb r2, [r3] ; limit - ldr r11, [src], pstep ; p1 - orr r4, r4, r4, lsl #8 - ldrb r3, [r6] ; thresh - orr r2, r2, r2, lsl #8 - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|Hnext8| - ; vp8_filter_mask() function - ; calculate breakout conditions - ldr r12, [src], pstep ; p0 - - uqsub8 r6, r9, r10 ; p3 - p2 - uqsub8 r7, r10, r9 ; p2 - p3 - uqsub8 r8, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - - orr r6, r6, r7 ; abs (p3-p2) - orr r8, r8, r10 ; abs (p2-p1) - uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask - uqsub8 r8, r8, r2 ; compare to limit - uqsub8 r6, r11, r12 ; p1 - p0 - orr lr, lr, r8 - uqsub8 r7, r12, r11 ; p0 - p1 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - orr r6, r6, r7 ; abs (p1-p0) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later - orr lr, lr, r7 - - uqsub8 r6, r11, r10 ; p1 - q1 - uqsub8 r7, r10, r11 ; q1 - p1 - uqsub8 r11, r12, r9 ; p0 - q0 - uqsub8 r12, r9, r12 ; q0 - p0 - orr r6, r6, r7 ; abs (p1-q1) - ldr r7, c0x7F7F7F7F - orr r12, r11, r12 ; abs (p0-q0) - ldr r11, [src], pstep ; q2 - uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 - and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r7, r9, r10 ; q0 - q1 - uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r6, r10, r9 ; q1 - q0 - uqsub8 r12, r12, r4 ; compare to flimit - uqsub8 r9, r11, r10 ; q2 - q1 - - orr lr, lr, r12 - - ldr r12, [src], pstep ; q3 - uqsub8 r10, r10, r11 ; q1 - q2 - orr r6, r7, r6 ; abs (q1-q0) - orr r10, r9, r10 ; abs (q2-q1) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r10, r10, r2 ; compare to limit - uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later - orr lr, lr, r7 - orr lr, lr, r10 - - uqsub8 r10, r12, r11 ; q3 - q2 - uqsub8 r9, r11, r12 ; q2 - q3 - - mvn r11, #0 ; r11 == -1 - - orr r10, r10, r9 ; abs (q3-q2) - uqsub8 r10, r10, r2 ; compare to limit - - mov r12, #0 - orr lr, lr, r10 - sub src, src, pstep, lsl #2 - - usub8 lr, r12, lr ; use usub8 instead of ssub8 - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq hskip_filter ; skip filtering - - sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines - - ;vp8_hevmask() function - ;calculate high edge variance - orr r10, r6, r8 ; calculate vp8_hevmask - - ldr r7, [src], pstep ; p1 - - usub8 r10, r12, r10 ; use usub8 instead of ssub8 - sel r6, r12, r11 ; obtain vp8_hevmask: r6 - - ;vp8_filter() function - ldr r8, [src], pstep ; p0 - ldr r12, c0x80808080 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - - eor r7, r7, r12 ; p1 offset to convert to a signed value - eor r8, r8, r12 ; p0 offset to convert to a signed value - eor r9, r9, r12 ; q0 offset to convert to a signed value - eor r10, r10, r12 ; q1 offset to convert to a signed value - - str r9, [sp] ; store qs0 temporarily - str r8, [sp, #4] ; store ps0 temporarily - str r10, [sp, #8] ; store qs1 temporarily - str r7, [sp, #12] ; store ps1 temporarily - - qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) - qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - - and r7, r7, r6 ; vp8_filter (r7) &= hev - - qadd8 r7, r7, r8 - ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 - - qadd8 r7, r7, r8 - ldr r10, c0x04040404 - - qadd8 r7, r7, r8 - and r7, r7, lr ; vp8_filter &= mask; - - ;modify code for vp8 -- Filter1 = vp8_filter (r7) - qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) - qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) - - mov r9, #0 - shadd8 r8 , r8 , r9 ; Filter2 >>= 3 - shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 - shadd8 r8 , r8 , r9 - shadd8 r7 , r7 , r9 - shadd8 lr , r8 , r9 ; lr: Filter2 - shadd8 r7 , r7 , r9 ; r7: filter - - ;usub8 lr, r8, r10 ; s = (s==4)*-1 - ;sel lr, r11, r9 - ;usub8 r8, r10, r8 - ;sel r8, r11, r9 - ;and r8, r8, lr ; -1 for each element that equals 4 - - ;calculate output - ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) - - ldr r8, [sp] ; load qs0 - ldr r9, [sp, #4] ; load ps0 - - ldr r10, c0x01010101 - - qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) - qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) - - ;end of modification for vp8 - - mov lr, #0 - sadd8 r7, r7 , r10 ; vp8_filter += 1 - shadd8 r7, r7, lr ; vp8_filter >>= 1 - - ldr r11, [sp, #12] ; load ps1 - ldr r10, [sp, #8] ; load qs1 - - bic r7, r7, r6 ; vp8_filter &= ~hev - sub src, src, pstep, lsl #2 - - qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) - qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) - - eor r11, r11, r12 ; *op1 = u^0x80 - str r11, [src], pstep ; store op1 - eor r9, r9, r12 ; *op0 = u^0x80 - str r9, [src], pstep ; store op0 result - eor r8, r8, r12 ; *oq0 = u^0x80 - str r8, [src], pstep ; store oq0 result - eor r10, r10, r12 ; *oq1 = u^0x80 - str r10, [src], pstep ; store oq1 - - sub src, src, pstep, lsl #1 - -|hskip_filter| - add src, src, #4 - sub src, src, pstep, lsl #2 - - subs count, count, #1 - - ldrne r9, [src], pstep ; p3 - ldrne r10, [src], pstep ; p2 - ldrne r11, [src], pstep ; p1 - - bne Hnext8 - - add sp, sp, #16 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_loop_filter_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_mbloop_filter_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r6, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r9, [src], pstep ; p3 - ldrb r4, [r2] ; blimit - ldr r10, [src], pstep ; p2 - ldrb r2, [r3] ; limit - ldr r11, [src], pstep ; p1 - orr r4, r4, r4, lsl #8 - ldrb r3, [r6] ; thresh - orr r2, r2, r2, lsl #8 - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|MBHnext8| - - ; vp8_filter_mask() function - ; calculate breakout conditions - ldr r12, [src], pstep ; p0 - - uqsub8 r6, r9, r10 ; p3 - p2 - uqsub8 r7, r10, r9 ; p2 - p3 - uqsub8 r8, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - - orr r6, r6, r7 ; abs (p3-p2) - orr r8, r8, r10 ; abs (p2-p1) - uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask - uqsub8 r8, r8, r2 ; compare to limit - - uqsub8 r6, r11, r12 ; p1 - p0 - orr lr, lr, r8 - uqsub8 r7, r12, r11 ; p0 - p1 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - orr r6, r6, r7 ; abs (p1-p0) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later - orr lr, lr, r7 - - uqsub8 r6, r11, r10 ; p1 - q1 - uqsub8 r7, r10, r11 ; q1 - p1 - uqsub8 r11, r12, r9 ; p0 - q0 - uqsub8 r12, r9, r12 ; q0 - p0 - orr r6, r6, r7 ; abs (p1-q1) - ldr r7, c0x7F7F7F7F - orr r12, r11, r12 ; abs (p0-q0) - ldr r11, [src], pstep ; q2 - uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 - and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r7, r9, r10 ; q0 - q1 - uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r6, r10, r9 ; q1 - q0 - uqsub8 r12, r12, r4 ; compare to flimit - uqsub8 r9, r11, r10 ; q2 - q1 - - orr lr, lr, r12 - - ldr r12, [src], pstep ; q3 - - uqsub8 r10, r10, r11 ; q1 - q2 - orr r6, r7, r6 ; abs (q1-q0) - orr r10, r9, r10 ; abs (q2-q1) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r10, r10, r2 ; compare to limit - uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later - orr lr, lr, r7 - orr lr, lr, r10 - - uqsub8 r10, r12, r11 ; q3 - q2 - uqsub8 r9, r11, r12 ; q2 - q3 - - mvn r11, #0 ; r11 == -1 - - orr r10, r10, r9 ; abs (q3-q2) - uqsub8 r10, r10, r2 ; compare to limit - - mov r12, #0 - - orr lr, lr, r10 - - usub8 lr, r12, lr ; use usub8 instead of ssub8 - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq mbhskip_filter ; skip filtering - - ;vp8_hevmask() function - ;calculate high edge variance - sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines - sub src, src, pstep, lsl #1 - - orr r10, r6, r8 - ldr r7, [src], pstep ; p1 - - usub8 r10, r12, r10 - sel r6, r12, r11 ; hev mask: r6 - - ;vp8_mbfilter() function - ;p2, q2 are only needed at the end. Don't need to load them in now. - ldr r8, [src], pstep ; p0 - ldr r12, c0x80808080 - ldr r9, [src], pstep ; q0 - ldr r10, [src] ; q1 - - eor r7, r7, r12 ; ps1 - eor r8, r8, r12 ; ps0 - eor r9, r9, r12 ; qs0 - eor r10, r10, r12 ; qs1 - - qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - str r7, [sp, #12] ; store ps1 temporarily - qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) - str r10, [sp, #8] ; store qs1 temporarily - qadd8 r7, r7, r12 - str r9, [sp] ; store qs0 temporarily - qadd8 r7, r7, r12 - str r8, [sp, #4] ; store ps0 temporarily - qadd8 r7, r7, r12 ; vp8_filter: r7 - - ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 - ldr r9, c0x04040404 - - and r7, r7, lr ; vp8_filter &= mask (lr is free) - - mov r12, r7 ; Filter2: r12 - and r12, r12, r6 ; Filter2 &= hev - - ;modify code for vp8 - ;save bottom 3 bits so that we round one side +4 and the other +3 - qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) - qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) - - mov r10, #0 - shadd8 r8 , r8 , r10 ; Filter1 >>= 3 - shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - shadd8 r8 , r8 , r10 - shadd8 r12 , r12 , r10 - shadd8 r8 , r8 , r10 ; r8: Filter1 - shadd8 r12 , r12 , r10 ; r12: Filter2 - - ldr r9, [sp] ; load qs0 - ldr r11, [sp, #4] ; load ps0 - - qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) - qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) - - ;save bottom 3 bits so that we round one side +4 and the other +3 - ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) - ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) - ;mov r10, #0 - ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - ;usub8 lr, r8, r9 ; s = (s==4)*-1 - ;sel lr, r11, r10 - ;shadd8 r12 , r12 , r10 - ;usub8 r8, r9, r8 - ;sel r8, r11, r10 - ;ldr r9, [sp] ; load qs0 - ;ldr r11, [sp, #4] ; load ps0 - ;shadd8 r12 , r12 , r10 - ;and r8, r8, lr ; -1 for each element that equals 4 - ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) - ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) - ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) - - ;end of modification for vp8 - - bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free) - ;mov r12, r7 - - ;roughly 3/7th difference across boundary - mov lr, #0x1b ; 27 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r7, r10, lr, r7 - smultb r10, r10, lr - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - add r10, r10, #63 - ssat r7, #8, r7, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r7, r10, lsl #16 - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) - - qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) - qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) - eor r8, r8, lr ; *oq0 = s^0x80 - str r8, [src] ; store *oq0 - sub src, src, pstep - eor r10, r10, lr ; *op0 = s^0x80 - str r10, [src] ; store *op0 - - ;roughly 2/7th difference across boundary - mov lr, #0x12 ; 18 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r9, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r9, #8, r9, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r9, r10, lsl #16 - - ldr r9, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) - - qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) - qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) - eor r11, r11, lr ; *op1 = s^0x80 - str r11, [src], pstep ; store *op1 - eor r8, r8, lr ; *oq1 = s^0x80 - add src, src, pstep, lsl #1 - - mov r7, #0x3f ; 63 - - str r8, [src], pstep ; store *oq1 - - ;roughly 1/7th difference across boundary - mov lr, #0x9 ; 9 - ldr r9, [src] ; load q2 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r12, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r12, #8, r12, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r12, r10, lsl #16 - - sub src, src, pstep - ldr lr, c0x80808080 - - ldr r11, [src] ; load p2 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - eor r9, r9, lr - eor r11, r11, lr - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) - - qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) - qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) - eor r8, r8, lr ; *op2 = s^0x80 - str r8, [src], pstep, lsl #2 ; store *op2 - add src, src, pstep - eor r10, r10, lr ; *oq2 = s^0x80 - str r10, [src], pstep, lsl #1 ; store *oq2 - -|mbhskip_filter| - add src, src, #4 - sub src, src, pstep, lsl #3 - subs count, count, #1 - - ldrne r9, [src], pstep ; p3 - ldrne r10, [src], pstep ; p2 - ldrne r11, [src], pstep ; p1 - - bne MBHnext8 - - add sp, sp, #16 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_loop_filter_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, #4 ; move src pointer down by 4 - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r12, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r6, [src], pstep ; load source data - ldrb r4, [r2] ; blimit - ldr r7, [src], pstep - ldrb r2, [r3] ; limit - ldr r8, [src], pstep - orr r4, r4, r4, lsl #8 - ldrb r3, [r12] ; thresh - orr r2, r2, r2, lsl #8 - ldr lr, [src], pstep - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|Vnext8| - - ; vp8_filter_mask() function - ; calculate breakout conditions - ; transpose the source data for 4-in-parallel operation - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - uqsub8 r7, r9, r10 ; p3 - p2 - uqsub8 r8, r10, r9 ; p2 - p3 - uqsub8 r9, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - orr r7, r7, r8 ; abs (p3-p2) - orr r10, r9, r10 ; abs (p2-p1) - uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask - uqsub8 r10, r10, r2 ; compare to limit - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr lr, lr, r10 - - uqsub8 r6, r11, r12 ; p1 - p0 - uqsub8 r7, r12, r11 ; p0 - p1 - add src, src, #4 ; move src pointer up by 4 - orr r6, r6, r7 ; abs (p1-p0) - str r11, [sp, #12] ; save p1 - uqsub8 r10, r6, r2 ; compare to limit - uqsub8 r11, r6, r3 ; compare to thresh - orr lr, lr, r10 - - ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now - ; transpose the source data for 4-in-parallel operation - ldr r6, [src], pstep ; load source data - str r11, [sp] ; push r11 to stack - ldr r7, [src], pstep - str r12, [sp, #4] ; save current reg before load q0 - q3 data - ldr r8, [src], pstep - str lr, [sp, #8] - ldr lr, [src], pstep - - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - ldr lr, [sp, #8] ; load back (f)limit accumulator - - uqsub8 r6, r12, r11 ; q3 - q2 - uqsub8 r7, r11, r12 ; q2 - q3 - uqsub8 r12, r11, r10 ; q2 - q1 - uqsub8 r11, r10, r11 ; q1 - q2 - orr r6, r6, r7 ; abs (q3-q2) - orr r7, r12, r11 ; abs (q2-q1) - uqsub8 r6, r6, r2 ; compare to limit - uqsub8 r7, r7, r2 ; compare to limit - ldr r11, [sp, #4] ; load back p0 - ldr r12, [sp, #12] ; load back p1 - orr lr, lr, r6 - orr lr, lr, r7 - - uqsub8 r6, r11, r9 ; p0 - q0 - uqsub8 r7, r9, r11 ; q0 - p0 - uqsub8 r8, r12, r10 ; p1 - q1 - uqsub8 r11, r10, r12 ; q1 - p1 - orr r6, r6, r7 ; abs (p0-q0) - ldr r7, c0x7F7F7F7F - orr r8, r8, r11 ; abs (p1-q1) - uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 - and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r11, r10, r9 ; q1 - q0 - uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r12, r9, r10 ; q0 - q1 - uqsub8 r6, r6, r4 ; compare to flimit - - orr r9, r11, r12 ; abs (q1-q0) - uqsub8 r8, r9, r2 ; compare to limit - uqsub8 r10, r9, r3 ; compare to thresh - orr lr, lr, r6 - orr lr, lr, r8 - - mvn r11, #0 ; r11 == -1 - mov r12, #0 - - usub8 lr, r12, lr - ldr r9, [sp] ; load the compared result - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq vskip_filter ; skip filtering - - ;vp8_hevmask() function - ;calculate high edge variance - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r9, r9, r10 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - usub8 r9, r12, r9 - sel r6, r12, r11 ; hev mask: r6 - - ;vp8_filter() function - ; load soure data to r6, r11, r12, lr - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - pkhbt r12, r7, r8, lsl #16 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - pkhbt r11, r9, r10, lsl #16 - - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first - str r6, [sp] - str lr, [sp, #4] - - pkhbt r6, r7, r8, lsl #16 - pkhbt lr, r9, r10, lsl #16 - - ;transpose r12, r11, r6, lr to r7, r8, r9, r10 - TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 - - ;load back hev_mask r6 and filter_mask lr - ldr r12, c0x80808080 - ldr r6, [sp] - ldr lr, [sp, #4] - - eor r7, r7, r12 ; p1 offset to convert to a signed value - eor r8, r8, r12 ; p0 offset to convert to a signed value - eor r9, r9, r12 ; q0 offset to convert to a signed value - eor r10, r10, r12 ; q1 offset to convert to a signed value - - str r9, [sp] ; store qs0 temporarily - str r8, [sp, #4] ; store ps0 temporarily - str r10, [sp, #8] ; store qs1 temporarily - str r7, [sp, #12] ; store ps1 temporarily - - qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) - qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - - and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter) - - qadd8 r7, r7, r8 - ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 - - qadd8 r7, r7, r8 - ldr r10, c0x04040404 - - qadd8 r7, r7, r8 - ;mvn r11, #0 ; r11 == -1 - - and r7, r7, lr ; vp8_filter &= mask - - ;modify code for vp8 -- Filter1 = vp8_filter (r7) - qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) - qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) - - mov r9, #0 - shadd8 r8 , r8 , r9 ; Filter2 >>= 3 - shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 - shadd8 r8 , r8 , r9 - shadd8 r7 , r7 , r9 - shadd8 lr , r8 , r9 ; lr: filter2 - shadd8 r7 , r7 , r9 ; r7: filter - - ;usub8 lr, r8, r10 ; s = (s==4)*-1 - ;sel lr, r11, r9 - ;usub8 r8, r10, r8 - ;sel r8, r11, r9 - ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s - - ;calculate output - ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) - - ldr r8, [sp] ; load qs0 - ldr r9, [sp, #4] ; load ps0 - - ldr r10, c0x01010101 - - qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) - qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) - ;end of modification for vp8 - - eor r8, r8, r12 - eor r9, r9, r12 - - mov lr, #0 - - sadd8 r7, r7, r10 - shadd8 r7, r7, lr - - ldr r10, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - - bic r7, r7, r6 ; r7: vp8_filter - - qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) - qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) - eor r10, r10, r12 - eor r11, r11, r12 - - sub src, src, pstep, lsl #2 - - ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 - ;output is b0, b1, b2, b3 - ;b0: 03 02 01 00 - ;b1: 13 12 11 10 - ;b2: 23 22 21 20 - ;b3: 33 32 31 30 - ; p1 p0 q0 q1 - ; (a3 a2 a1 a0) - TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr - - strh r6, [src, #-2] ; store the result - mov r6, r6, lsr #16 - strh r6, [src], pstep - - strh r7, [src, #-2] - mov r7, r7, lsr #16 - strh r7, [src], pstep - - strh r12, [src, #-2] - mov r12, r12, lsr #16 - strh r12, [src], pstep - - strh lr, [src, #-2] - mov lr, lr, lsr #16 - strh lr, [src], pstep - -|vskip_filter| - sub src, src, #4 - subs count, count, #1 - - ldrne r6, [src], pstep ; load source data - ldrne r7, [src], pstep - ldrne r8, [src], pstep - ldrne lr, [src], pstep - - bne Vnext8 - - add sp, sp, #16 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_loop_filter_vertical_edge_armv6| - - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_mbloop_filter_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, #4 ; move src pointer down by 4 - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r12, [sp, #36] ; load thresh address - pld [src, #23] ; preload for next block - sub sp, sp, #16 ; create temp buffer - - ldr r6, [src], pstep ; load source data - ldrb r4, [r2] ; blimit - pld [src, #23] - ldr r7, [src], pstep - ldrb r2, [r3] ; limit - pld [src, #23] - ldr r8, [src], pstep - orr r4, r4, r4, lsl #8 - ldrb r3, [r12] ; thresh - orr r2, r2, r2, lsl #8 - pld [src, #23] - ldr lr, [src], pstep - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|MBVnext8| - ; vp8_filter_mask() function - ; calculate breakout conditions - ; transpose the source data for 4-in-parallel operation - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - uqsub8 r7, r9, r10 ; p3 - p2 - uqsub8 r8, r10, r9 ; p2 - p3 - uqsub8 r9, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - orr r7, r7, r8 ; abs (p3-p2) - orr r10, r9, r10 ; abs (p2-p1) - uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask - uqsub8 r10, r10, r2 ; compare to limit - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr lr, lr, r10 - - uqsub8 r6, r11, r12 ; p1 - p0 - uqsub8 r7, r12, r11 ; p0 - p1 - add src, src, #4 ; move src pointer up by 4 - orr r6, r6, r7 ; abs (p1-p0) - str r11, [sp, #12] ; save p1 - uqsub8 r10, r6, r2 ; compare to limit - uqsub8 r11, r6, r3 ; compare to thresh - orr lr, lr, r10 - - ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now - ; transpose the source data for 4-in-parallel operation - ldr r6, [src], pstep ; load source data - str r11, [sp] ; push r11 to stack - ldr r7, [src], pstep - str r12, [sp, #4] ; save current reg before load q0 - q3 data - ldr r8, [src], pstep - str lr, [sp, #8] - ldr lr, [src], pstep - - - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - ldr lr, [sp, #8] ; load back (f)limit accumulator - - uqsub8 r6, r12, r11 ; q3 - q2 - uqsub8 r7, r11, r12 ; q2 - q3 - uqsub8 r12, r11, r10 ; q2 - q1 - uqsub8 r11, r10, r11 ; q1 - q2 - orr r6, r6, r7 ; abs (q3-q2) - orr r7, r12, r11 ; abs (q2-q1) - uqsub8 r6, r6, r2 ; compare to limit - uqsub8 r7, r7, r2 ; compare to limit - ldr r11, [sp, #4] ; load back p0 - ldr r12, [sp, #12] ; load back p1 - orr lr, lr, r6 - orr lr, lr, r7 - - uqsub8 r6, r11, r9 ; p0 - q0 - uqsub8 r7, r9, r11 ; q0 - p0 - uqsub8 r8, r12, r10 ; p1 - q1 - uqsub8 r11, r10, r12 ; q1 - p1 - orr r6, r6, r7 ; abs (p0-q0) - ldr r7, c0x7F7F7F7F - orr r8, r8, r11 ; abs (p1-q1) - uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 - and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r11, r10, r9 ; q1 - q0 - uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r12, r9, r10 ; q0 - q1 - uqsub8 r6, r6, r4 ; compare to flimit - - orr r9, r11, r12 ; abs (q1-q0) - uqsub8 r8, r9, r2 ; compare to limit - uqsub8 r10, r9, r3 ; compare to thresh - orr lr, lr, r6 - orr lr, lr, r8 - - mvn r11, #0 ; r11 == -1 - mov r12, #0 - - usub8 lr, r12, lr - ldr r9, [sp] ; load the compared result - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq mbvskip_filter ; skip filtering - - - - ;vp8_hevmask() function - ;calculate high edge variance - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r9, r9, r10 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - usub8 r9, r12, r9 - sel r6, r12, r11 ; hev mask: r6 - - - ; vp8_mbfilter() function - ; p2, q2 are only needed at the end. Don't need to load them in now. - ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first - ; load soure data to r6, r11, r12, lr - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - pkhbt r12, r7, r8, lsl #16 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - pkhbt r11, r9, r10, lsl #16 - - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - str r6, [sp] ; save r6 - str lr, [sp, #4] ; save lr - - pkhbt r6, r7, r8, lsl #16 - pkhbt lr, r9, r10, lsl #16 - - ;transpose r12, r11, r6, lr to p1, p0, q0, q1 - TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 - - ;load back hev_mask r6 and filter_mask lr - ldr r12, c0x80808080 - ldr r6, [sp] - ldr lr, [sp, #4] - - eor r7, r7, r12 ; ps1 - eor r8, r8, r12 ; ps0 - eor r9, r9, r12 ; qs0 - eor r10, r10, r12 ; qs1 - - qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - str r7, [sp, #12] ; store ps1 temporarily - qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) - str r10, [sp, #8] ; store qs1 temporarily - qadd8 r7, r7, r12 - str r9, [sp] ; store qs0 temporarily - qadd8 r7, r7, r12 - str r8, [sp, #4] ; store ps0 temporarily - qadd8 r7, r7, r12 ; vp8_filter: r7 - - ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 - ldr r9, c0x04040404 - ;mvn r11, #0 ; r11 == -1 - - and r7, r7, lr ; vp8_filter &= mask (lr is free) - - mov r12, r7 ; Filter2: r12 - and r12, r12, r6 ; Filter2 &= hev - - ;modify code for vp8 - ;save bottom 3 bits so that we round one side +4 and the other +3 - qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) - qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) - - mov r10, #0 - shadd8 r8 , r8 , r10 ; Filter1 >>= 3 - shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - shadd8 r8 , r8 , r10 - shadd8 r12 , r12 , r10 - shadd8 r8 , r8 , r10 ; r8: Filter1 - shadd8 r12 , r12 , r10 ; r12: Filter2 - - ldr r9, [sp] ; load qs0 - ldr r11, [sp, #4] ; load ps0 - - qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) - qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) - - ;save bottom 3 bits so that we round one side +4 and the other +3 - ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) - ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) - ;mov r10, #0 - ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - ;usub8 lr, r8, r9 ; s = (s==4)*-1 - ;sel lr, r11, r10 - ;shadd8 r12 , r12 , r10 - ;usub8 r8, r9, r8 - ;sel r8, r11, r10 - ;ldr r9, [sp] ; load qs0 - ;ldr r11, [sp, #4] ; load ps0 - ;shadd8 r12 , r12 , r10 - ;and r8, r8, lr ; -1 for each element that equals 4 - ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) - ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) - ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) - - ;end of modification for vp8 - - bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free) - ;mov r12, r7 - - ;roughly 3/7th difference across boundary - mov lr, #0x1b ; 27 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r7, r10, lr, r7 - smultb r10, r10, lr - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - add r10, r10, #63 - ssat r7, #8, r7, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r7, r10, lsl #16 - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) - - qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) - qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) - eor r8, r8, lr ; *oq0 = s^0x80 - eor r10, r10, lr ; *op0 = s^0x80 - - strb r10, [src, #-1] ; store op0 result - strb r8, [src], pstep ; store oq0 result - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - - ;roughly 2/7th difference across boundary - mov lr, #0x12 ; 18 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r9, r10, lr, r7 - - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r9, #8, r9, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r9, r10, lsl #16 - - ldr r9, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - ldr lr, c0x80808080 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - add src, src, #2 - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) - - qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) - qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) - eor r8, r8, lr ; *oq1 = s^0x80 - eor r10, r10, lr ; *op1 = s^0x80 - - ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary - strb r10, [src, #-4] ; store op1 - strb r8, [src, #-1] ; store oq1 - ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - orr r11, r11, r6, lsl #8 - orr r9, r9, r7, lsl #8 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - orr r11, r11, r6, lsl #16 - orr r9, r9, r7, lsl #16 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - orr r11, r11, r6, lsl #24 - orr r9, r9, r7, lsl #24 - - ;roughly 1/7th difference across boundary - eor r9, r9, lr - eor r11, r11, lr - - mov lr, #0x9 ; 9 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r12, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r12, #8, r12, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r12, r10, lsl #16 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - ldr lr, c0x80808080 - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) - - qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) - qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) - eor r8, r8, lr ; *op2 = s^0x80 - eor r10, r10, lr ; *oq2 = s^0x80 - - strb r8, [src, #-5] ; store *op2 - strb r10, [src], pstep ; store *oq2 - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - - ;adjust src pointer for next loop - sub src, src, #2 - -|mbvskip_filter| - sub src, src, #4 - subs count, count, #1 - - pld [src, #23] ; preload for next block - ldrne r6, [src], pstep ; load source data - pld [src, #23] - ldrne r7, [src], pstep - pld [src, #23] - ldrne r8, [src], pstep - pld [src, #23] - ldrne lr, [src], pstep - - bne MBVnext8 - - add sp, sp, #16 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| - -; Constant Pool -c0x80808080 DCD 0x80808080 -c0x03030303 DCD 0x03030303 -c0x04040404 DCD 0x04040404 -c0x01010101 DCD 0x01010101 -c0x7F7F7F7F DCD 0x7F7F7F7F - - END |