path: root/vp8/encoder
author     John Koleszar <jkoleszar@google.com>    2010-05-18 11:58:33 -0400
committer  John Koleszar <jkoleszar@google.com>    2010-05-18 11:58:33 -0400
commit     0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch)
tree       1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/encoder
download   libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz
Initial WebM release (tag: v0.9.0)
Diffstat (limited to 'vp8/encoder')
-rw-r--r--  vp8/encoder/arm/armv6/walsh_v6.asm | 144
-rw-r--r--  vp8/encoder/arm/boolhuff_arm.c | 33
-rw-r--r--  vp8/encoder/arm/csystemdependent.c | 159
-rw-r--r--  vp8/encoder/arm/dct_arm.h | 45
-rw-r--r--  vp8/encoder/arm/encodemb_arm.c | 30
-rw-r--r--  vp8/encoder/arm/encodemb_arm.h | 43
-rw-r--r--  vp8/encoder/arm/mcomp_arm.c | 1662
-rw-r--r--  vp8/encoder/arm/neon/boolhuff_armv7.asm | 292
-rw-r--r--  vp8/encoder/arm/neon/fastfdct4x4_neon.asm | 126
-rw-r--r--  vp8/encoder/arm/neon/fastfdct8x4_neon.asm | 179
-rw-r--r--  vp8/encoder/arm/neon/fastquantizeb_neon.asm | 117
-rw-r--r--  vp8/encoder/arm/neon/sad16_neon.asm | 206
-rw-r--r--  vp8/encoder/arm/neon/sad8_neon.asm | 208
-rw-r--r--  vp8/encoder/arm/neon/shortfdct_neon.asm | 146
-rw-r--r--  vp8/encoder/arm/neon/subtract_neon.asm | 171
-rw-r--r--  vp8/encoder/arm/neon/variance_neon.asm | 275
-rw-r--r--  vp8/encoder/arm/neon/vp8_memcpy_neon.asm | 67
-rw-r--r--  vp8/encoder/arm/neon/vp8_mse16x16_neon.asm | 172
-rw-r--r--  vp8/encoder/arm/neon/vp8_packtokens_armv7.asm | 300
-rw-r--r--  vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm | 335
-rw-r--r--  vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm | 471
-rw-r--r--  vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm | 75
-rw-r--r--  vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm | 427
-rw-r--r--  vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm | 571
-rw-r--r--  vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm | 226
-rw-r--r--  vp8/encoder/arm/picklpf_arm.c | 49
-rw-r--r--  vp8/encoder/arm/quantize_arm.c | 79
-rw-r--r--  vp8/encoder/arm/quantize_arm.h | 22
-rw-r--r--  vp8/encoder/arm/variance_arm.h | 105
-rw-r--r--  vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c | 77
-rw-r--r--  vp8/encoder/bitstream.c | 1719
-rw-r--r--  vp8/encoder/bitstream.h | 38
-rw-r--r--  vp8/encoder/block.h | 115
-rw-r--r--  vp8/encoder/boolhuff.c | 147
-rw-r--r--  vp8/encoder/boolhuff.h | 42
-rw-r--r--  vp8/encoder/dct.c | 223
-rw-r--r--  vp8/encoder/dct.h | 65
-rw-r--r--  vp8/encoder/encodeframe.c | 1223
-rw-r--r--  vp8/encoder/encodeintra.c | 236
-rw-r--r--  vp8/encoder/encodeintra.h | 24
-rw-r--r--  vp8/encoder/encodemb.c | 1129
-rw-r--r--  vp8/encoder/encodemb.h | 112
-rw-r--r--  vp8/encoder/encodemv.c | 445
-rw-r--r--  vp8/encoder/encodemv.h | 20
-rw-r--r--  vp8/encoder/ethreading.c | 510
-rw-r--r--  vp8/encoder/firstpass.c | 2512
-rw-r--r--  vp8/encoder/firstpass.h | 22
-rw-r--r--  vp8/encoder/generic/csystemdependent.c | 96
-rw-r--r--  vp8/encoder/mcomp.c | 1467
-rw-r--r--  vp8/encoder/mcomp.h | 121
-rw-r--r--  vp8/encoder/modecosts.c | 46
-rw-r--r--  vp8/encoder/modecosts.h | 16
-rw-r--r--  vp8/encoder/onyx_if.c | 5428
-rw-r--r--  vp8/encoder/onyx_int.h | 670
-rw-r--r--  vp8/encoder/parms.cpp | 106
-rw-r--r--  vp8/encoder/pickinter.c | 923
-rw-r--r--  vp8/encoder/pickinter.h | 20
-rw-r--r--  vp8/encoder/picklpf.c | 435
-rw-r--r--  vp8/encoder/ppc/csystemdependent.c | 168
-rw-r--r--  vp8/encoder/ppc/encodemb_altivec.asm | 152
-rw-r--r--  vp8/encoder/ppc/fdct_altivec.asm | 204
-rw-r--r--  vp8/encoder/ppc/rdopt_altivec.asm | 50
-rw-r--r--  vp8/encoder/ppc/sad_altivec.asm | 276
-rw-r--r--  vp8/encoder/ppc/variance_altivec.asm | 374
-rw-r--r--  vp8/encoder/ppc/variance_subpixel_altivec.asm | 864
-rw-r--r--  vp8/encoder/preproc.c | 250
-rw-r--r--  vp8/encoder/psnr.c | 116
-rw-r--r--  vp8/encoder/psnr.h | 17
-rw-r--r--  vp8/encoder/quantize.c | 249
-rw-r--r--  vp8/encoder/quantize.h | 52
-rw-r--r--  vp8/encoder/ratectrl.c | 1552
-rw-r--r--  vp8/encoder/ratectrl.h | 26
-rw-r--r--  vp8/encoder/rdopt.c | 2212
-rw-r--r--  vp8/encoder/rdopt.h | 20
-rw-r--r--  vp8/encoder/sad_c.c | 248
-rw-r--r--  vp8/encoder/ssim.c | 521
-rw-r--r--  vp8/encoder/tokenize.c | 636
-rw-r--r--  vp8/encoder/tokenize.h | 38
-rw-r--r--  vp8/encoder/treewriter.c | 38
-rw-r--r--  vp8/encoder/treewriter.h | 121
-rw-r--r--  vp8/encoder/variance.h | 327
-rw-r--r--  vp8/encoder/variance_c.c | 527
-rw-r--r--  vp8/encoder/x86/csystemdependent.c | 289
-rw-r--r--  vp8/encoder/x86/dct_mmx.asm | 846
-rw-r--r--  vp8/encoder/x86/dct_sse2.asm | 260
-rw-r--r--  vp8/encoder/x86/dct_x86.h | 73
-rw-r--r--  vp8/encoder/x86/encodemb_x86.h | 73
-rw-r--r--  vp8/encoder/x86/encodeopt.asm | 393
-rw-r--r--  vp8/encoder/x86/fwalsh_sse2.asm | 117
-rw-r--r--  vp8/encoder/x86/mcomp_x86.h | 27
-rw-r--r--  vp8/encoder/x86/preproc_mmx.c | 297
-rw-r--r--  vp8/encoder/x86/quantize_mmx.asm | 438
-rw-r--r--  vp8/encoder/x86/sad_mmx.asm | 428
-rw-r--r--  vp8/encoder/x86/sad_sse2.asm | 329
-rw-r--r--  vp8/encoder/x86/sad_sse3.asm | 939
-rw-r--r--  vp8/encoder/x86/sad_ssse3.asm | 367
-rw-r--r--  vp8/encoder/x86/subtract_mmx.asm | 431
-rw-r--r--  vp8/encoder/x86/variance_impl_mmx.asm | 980
-rw-r--r--  vp8/encoder/x86/variance_impl_sse2.asm | 975
-rw-r--r--  vp8/encoder/x86/variance_mmx.c | 596
-rw-r--r--  vp8/encoder/x86/variance_sse2.c | 514
-rw-r--r--  vp8/encoder/x86/variance_x86.h | 275
-rw-r--r--  vp8/encoder/x86/x86_csystemdependent.c | 287
103 files changed, 42694 insertions, 0 deletions
diff --git a/vp8/encoder/arm/armv6/walsh_v6.asm b/vp8/encoder/arm/armv6/walsh_v6.asm
new file mode 100644
index 000000000..608c9ae65
--- /dev/null
+++ b/vp8/encoder/arm/armv6/walsh_v6.asm
@@ -0,0 +1,144 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_walsh4x4_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+|vp8_short_walsh4x4_armv6| PROC
+
+ stmdb sp!, {r4 - r11, lr}
+
+ mov r12, r2 ; ugh. not clean
+ ldr r2, [r0] ; [1 | 0]
+ ldr r3, [r0, #4] ; [3 | 2]
+ ldr r4, [r0, r12]! ; [5 | 4]
+ ldr r5, [r0, #4] ; [7 | 6]
+ ldr r6, [r0, r12]! ; [9 | 8]
+ ldr r7, [r0, #4] ; [11 | 10]
+ ldr r8, [r0, r12]! ; [13 | 12]
+ ldr r9, [r0, #4] ; [15 | 14]
+
+ qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
+ qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
+ qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
+ qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
+
+ qaddsubx r2, r10, r11 ; [1 | 2] [c1+d1 | a1-b1]
+ qaddsubx r3, r11, r10 ; [0 | 3] [b1+a1 | d1-c1]
+ qaddsubx r4, r12, lr ; [5 | 6] [c1+d1 | a1-b1]
+ qaddsubx r5, lr, r12 ; [4 | 7] [b1+a1 | d1-c1]
+
+ qsubaddx r10, r6, r7 ; [c1|a1] [9-10 | 8+11]
+ qaddsubx r11, r6, r7 ; [b1|d1] [9+10 | 8-11]
+ qsubaddx r12, r8, r9 ; [c1|a1] [13-14 | 12+15]
+ qaddsubx lr, r8, r9 ; [b1|d1] [13+14 | 12-15]
+
+ qaddsubx r6, r10, r11 ; [9 |10] [c1+d1 | a1-b1]
+ qaddsubx r7, r11, r10 ; [8 |11] [b1+a1 | d1-c1]
+ qaddsubx r8, r12, lr ; [13|14] [c1+d1 | a1-b1]
+ qaddsubx r9, lr, r12 ; [12|15] [b1+a1 | d1-c1]
+
+ ; first transform complete
+
+ qadd16 r10, r3, r9 ; a1 [0+12 | 3+15]
+ qadd16 r11, r5, r7 ; b1 [4+8 | 7+11]
+ qsub16 r12, r5, r7 ; c1 [4-8 | 7-11]
+ qsub16 lr, r3, r9 ; d1 [0-12 | 3-15]
+
+ qadd16 r3, r10, r11 ; a2 [a1+b1] [0 | 3]
+ qadd16 r5, r12, lr ; b2 [c1+d1] [4 | 7]
+ qsub16 r7, r10, r11 ; c2 [a1-b1] [8 |11]
+ qsub16 r9, lr, r12 ; d2 [d1-c1] [12|15]
+
+ qadd16 r10, r2, r8 ; a1 [1+13 | 2+14]
+ qadd16 r11, r4, r6 ; b1 [5+9 | 6+10]
+ qsub16 r12, r4, r6 ; c1 [5-9 | 6-10]
+ qsub16 lr, r2, r8 ; d1 [1-13 | 2-14]
+
+ qadd16 r2, r10, r11 ; a2 [a1+b1] [1 | 2]
+ qadd16 r4, r12, lr ; b2 [c1+d1] [5 | 6]
+ qsub16 r6, r10, r11 ; c2 [a1-b1] [9 |10]
+ qsub16 r8, lr, r12 ; d2 [d1-c1] [13|14]
+
+ ; [a-d]2 += ([a-d]2 > 0)
+
+ asrs r10, r3, #16
+ addpl r10, r10, #1 ; [~0]
+ asrs r11, r2, #16
+ addpl r11, r11, #1 ; [~1]
+ lsl r11, r11, #15 ; [1 | x]
+ pkhtb r10, r11, r10, asr #1; [1 | 0]
+ str r10, [r1], #4
+
+ lsls r11, r2, #16
+ addpl r11, r11, #0x10000 ; [~2]
+ lsls r12, r3, #16
+ addpl r12, r12, #0x10000 ; [~3]
+ asr r12, r12, #1 ; [3 | x]
+ pkhtb r11, r12, r11, asr #17; [3 | 2]
+ str r11, [r1], #4
+
+ asrs r2, r5, #16
+ addpl r2, r2, #1 ; [~4]
+ asrs r3, r4, #16
+ addpl r3, r3, #1 ; [~5]
+ lsl r3, r3, #15 ; [5 | x]
+ pkhtb r2, r3, r2, asr #1 ; [5 | 4]
+ str r2, [r1], #4
+
+ lsls r2, r4, #16
+ addpl r2, r2, #0x10000 ; [~6]
+ lsls r3, r5, #16
+ addpl r3, r3, #0x10000 ; [~7]
+ asr r3, r3, #1 ; [7 | x]
+ pkhtb r2, r3, r2, asr #17 ; [7 | 6]
+ str r2, [r1], #4
+
+ asrs r2, r7, #16
+ addpl r2, r2, #1 ; [~8]
+ asrs r3, r6, #16
+ addpl r3, r3, #1 ; [~9]
+ lsl r3, r3, #15 ; [9 | x]
+ pkhtb r2, r3, r2, asr #1 ; [9 | 8]
+ str r2, [r1], #4
+
+ lsls r2, r6, #16
+ addpl r2, r2, #0x10000 ; [~10]
+ lsls r3, r7, #16
+ addpl r3, r3, #0x10000 ; [~11]
+ asr r3, r3, #1 ; [11 | x]
+ pkhtb r2, r3, r2, asr #17 ; [11 | 10]
+ str r2, [r1], #4
+
+ asrs r2, r9, #16
+ addpl r2, r2, #1 ; [~12]
+ asrs r3, r8, #16
+ addpl r3, r3, #1 ; [~13]
+ lsl r3, r3, #15 ; [13 | x]
+ pkhtb r2, r3, r2, asr #1 ; [13 | 12]
+ str r2, [r1], #4
+
+ lsls r2, r8, #16
+ addpl r2, r2, #0x10000 ; [~14]
+ lsls r3, r9, #16
+ addpl r3, r3, #0x10000 ; [~15]
+ asr r3, r3, #1 ; [15 | x]
+ pkhtb r2, r3, r2, asr #17 ; [15 | 14]
+ str r2, [r1]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_short_walsh4x4_armv6|
+
+ END
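The ARMv6 routine above is vp8's 4x4 Walsh-Hadamard transform of the second-order (luma DC) block: a horizontal pass, a vertical pass, and a final "add 1 if positive, then shift right by one" rounding, all done two 16-bit lanes at a time with the qadd16/qsub16/qaddsubx/qsubaddx instructions. The plain-C model below is reconstructed from the register comments as a reading aid; it is an illustrative sketch, not the project's reference implementation in dct.c, and may differ from it in minor details.

/* Illustrative C model of the ARMv6 routine above, reconstructed from its
 * register comments; not the project's reference implementation.
 * Assumption: pitch is given in bytes, as the pre-indexed byte loads suggest. */
void walsh4x4_model(short *input, short *output, int pitch)
{
    int i, a1, b1, c1, d1, a2, b2, c2, d2;
    short *ip = input;
    short *op = output;

    for (i = 0; i < 4; i++)            /* row (horizontal) pass */
    {
        a1 = ip[0] + ip[3];
        b1 = ip[1] + ip[2];
        c1 = ip[1] - ip[2];
        d1 = ip[0] - ip[3];

        op[0] = a1 + b1;
        op[1] = c1 + d1;
        op[2] = a1 - b1;
        op[3] = d1 - c1;

        ip += pitch / 2;               /* pitch is in bytes, ip is a short* */
        op += 4;
    }

    ip = output;
    op = output;

    for (i = 0; i < 4; i++)            /* column (vertical) pass */
    {
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        a2 = a1 + b1;
        b2 = c1 + d1;
        c2 = a1 - b1;
        d2 = d1 - c1;

        a2 += (a2 > 0);                /* the addpl/asr sequence above:      */
        b2 += (b2 > 0);                /* add 1 if positive, then arithmetic */
        c2 += (c2 > 0);                /* shift right by one                 */
        d2 += (d2 > 0);

        op[0]  = a2 >> 1;
        op[4]  = b2 >> 1;
        op[8]  = c2 >> 1;
        op[12] = d2 >> 1;

        ip++;
        op++;
    }
}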
diff --git a/vp8/encoder/arm/boolhuff_arm.c b/vp8/encoder/arm/boolhuff_arm.c
new file mode 100644
index 000000000..e70b3ad47
--- /dev/null
+++ b/vp8/encoder/arm/boolhuff_arm.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "boolhuff.h"
+#include "blockd.h"
+
+const unsigned int vp8_prob_cost[256] =
+{
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+ 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
+ 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
+ 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
+ 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
+ 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
+ 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
+ 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
+ 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
+ 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
+ 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
+};
+
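The vp8_prob_cost table gives, in 1/256th-of-a-bit units, the cost the boolean coder charges for a symbol whose probability is p/256; the encoder uses it for rate estimation when choosing modes and tokens. The entries track -256*log2(p/256). The sketch below regenerates the table; the exact formula is inferred from the numbers themselves and is not necessarily the generator the project used.

#include <math.h>

/* Sketch: regenerate the table above. Empirically the shipped entries match
 * floor(256 * log2(256 / p)) - 1, clamped to [1, 2047]; this is an
 * observation about the numbers, not the project's actual generator. */
static void build_prob_cost(unsigned int cost[256])
{
    int p;
    cost[0] = 2047;                                /* p == 0 is never coded */
    for (p = 1; p < 256; p++)
    {
        double bits = log2(256.0 / p);             /* -log2(p/256) */
        int c = (int)floor(bits * 256.0) - 1;
        if (c < 1)    c = 1;
        if (c > 2047) c = 2047;
        cost[p] = (unsigned int)c;
    }
}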
diff --git a/vp8/encoder/arm/csystemdependent.c b/vp8/encoder/arm/csystemdependent.c
new file mode 100644
index 000000000..003979680
--- /dev/null
+++ b/vp8/encoder/arm/csystemdependent.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "variance.h"
+#include "onyx_int.h"
+
+void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+
+void vp8_cmachine_specific_config(VP8_COMP *cpi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ cpi->rtcd.common = &cpi->common.rtcd;
+
+#if HAVE_ARMV7
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_neon;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_neon;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_neon;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_neon;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_neon;
+
+ cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_neon;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_neon;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_neon;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_neon;
+
+ cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_neon;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_neon;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_neon;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_neon;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;
+ cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_neon;
+
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_neon;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_neon;
+ cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_neon;
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_neon;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_neon;
+
+ cpi->rtcd.encodemb.berr = vp8_block_error_c;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_neon;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_neon;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_neon;
+
+ cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;
+#elif HAVE_ARMV6
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c;
+
+ cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;
+
+ cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;
+ cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;
+
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_armv6;
+
+ cpi->rtcd.encodemb.berr = vp8_block_error_c;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_c;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_c;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;
+
+ cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
+#else
+ //pure c
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c;
+
+ cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;
+
+ cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;
+ cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;
+
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
+
+ cpi->rtcd.encodemb.berr = vp8_block_error_c;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_c;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_c;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;
+
+ cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
+#endif
+#endif
+
+#if HAVE_ARMV7
+ vp8_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
+#else
+ vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
+#endif
+}
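vp8_cmachine_specific_config() is the ARM half of vp8's CPU dispatch: with CONFIG_RUNTIME_CPU_DETECT enabled, hot routines are reached through function pointers in cpi->rtcd, and this function fills that table once per encoder instance (here chosen by the HAVE_ARMV7/HAVE_ARMV6 build flags rather than by probing the CPU). Below is a minimal, self-contained sketch of the same idiom; the names and table layout are simplified stand-ins, not the real vp8 structures.

#include <stdio.h>

/* Minimal model of the dispatch-table idiom used above: one function-pointer
 * table, populated once according to the available CPU features. Names are
 * hypothetical and much simpler than the real cpi->rtcd layout. */
typedef unsigned int (*sad_fn_t)(const unsigned char *a, const unsigned char *b, int n);

static unsigned int sad_c(const unsigned char *a, const unsigned char *b, int n)
{
    unsigned int s = 0;
    int i;
    for (i = 0; i < n; i++)
        s += (a[i] > b[i]) ? (unsigned int)(a[i] - b[i]) : (unsigned int)(b[i] - a[i]);
    return s;
}

struct rtcd_table { sad_fn_t sad16x16; };

static void machine_specific_config(struct rtcd_table *rtcd, int have_neon)
{
    rtcd->sad16x16 = sad_c;                /* safe C default */
    if (have_neon)
    {
        /* rtcd->sad16x16 = sad16x16_neon;    would go here on ARMv7 */
    }
}

int main(void)
{
    unsigned char a[256] = {0}, b[256] = {1};
    struct rtcd_table rtcd;
    machine_specific_config(&rtcd, 0);
    printf("SAD = %u\n", rtcd.sad16x16(a, b, 256));   /* prints "SAD = 1" */
    return 0;
}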
diff --git a/vp8/encoder/arm/dct_arm.h b/vp8/encoder/arm/dct_arm.h
new file mode 100644
index 000000000..a671862fb
--- /dev/null
+++ b/vp8/encoder/arm/dct_arm.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef DCT_ARM_H
+#define DCT_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_fdct(vp8_short_walsh4x4_armv6);
+
+#undef vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
+#endif
+
+#if HAVE_ARMV7
+extern prototype_fdct(vp8_short_fdct4x4_neon);
+extern prototype_fdct(vp8_short_fdct8x4_neon);
+extern prototype_fdct(vp8_fast_fdct4x4_neon);
+extern prototype_fdct(vp8_fast_fdct8x4_neon);
+extern prototype_fdct(vp8_short_walsh4x4_neon);
+
+#undef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_neon
+
+#undef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_neon
+
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_neon
+
+#undef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_neon
+
+#undef vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
+
+#endif
+
+#endif
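dct_arm.h is the compile-time counterpart of the rtcd table: dct.h maps each vp8_fdct_* macro to a default C routine, and an architecture header such as this one re-points the macro at the optimized symbol, so builds without runtime CPU detection bind call sites directly. The sketch below illustrates the idiom; the prototype_fdct expansion shown is an assumption about dct.h (which is not part of this file) and is simplified.

/* Sketch of the override idiom; the prototype_fdct expansion is assumed. */
#define prototype_fdct(sym) void (sym)(short *input, short *output, int pitch)

/* dct.h: declare the C routine and make it the default binding. */
extern prototype_fdct(vp8_short_walsh4x4_c);
#ifndef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_c
#endif

/* dct_arm.h (this file): override for an ARMv6 build. */
#if HAVE_ARMV6
extern prototype_fdct(vp8_short_walsh4x4_armv6);
#undef  vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
#endif

/* A non-RTCD call site then compiles straight to the chosen symbol:  */
/*     vp8_fdct_walsh_short4x4(input, output, pitch);                 */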
diff --git a/vp8/encoder/arm/encodemb_arm.c b/vp8/encoder/arm/encodemb_arm.c
new file mode 100644
index 000000000..3f1d05391
--- /dev/null
+++ b/vp8/encoder/arm/encodemb_arm.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "encodemb.h"
+#include "reconinter.h"
+#include "quantize.h"
+#include "invtrans.h"
+#include "recon.h"
+#include "reconintra.h"
+#include "dct.h"
+#include "vpx_mem/vpx_mem.h"
+
+extern void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch);
+
+void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *src_ptr = (*(be->base_src) + be->src);
+ short *diff_ptr = be->src_diff;
+ unsigned char *pred_ptr = bd->predictor;
+ int src_stride = be->src_stride;
+
+ vp8_subtract_b_neon_func(diff_ptr, src_ptr, pred_ptr, src_stride, pitch);
+}
diff --git a/vp8/encoder/arm/encodemb_arm.h b/vp8/encoder/arm/encodemb_arm.h
new file mode 100644
index 000000000..28f9e5c5f
--- /dev/null
+++ b/vp8/encoder/arm/encodemb_arm.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef ENCODEMB_ARM_H
+#define ENCODEMB_ARM_H
+
+#if HAVE_ARMV7
+//extern prototype_berr(vp8_block_error_c);
+//extern prototype_mberr(vp8_mbblock_error_c);
+//extern prototype_mbuverr(vp8_mbuverror_c);
+
+extern prototype_subb(vp8_subtract_b_neon);
+extern prototype_submby(vp8_subtract_mby_neon);
+extern prototype_submbuv(vp8_subtract_mbuv_neon);
+
+//#undef vp8_encodemb_berr
+//#define vp8_encodemb_berr vp8_block_error_c
+
+//#undef vp8_encodemb_mberr
+//#define vp8_encodemb_mberr vp8_mbblock_error_c
+
+//#undef vp8_encodemb_mbuverr
+//#define vp8_encodemb_mbuverr vp8_mbuverror_c
+
+#undef vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_neon
+
+#undef vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_neon
+
+#undef vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_neon
+
+#endif
+
+#endif
diff --git a/vp8/encoder/arm/mcomp_arm.c b/vp8/encoder/arm/mcomp_arm.c
new file mode 100644
index 000000000..07f218605
--- /dev/null
+++ b/vp8/encoder/arm/mcomp_arm.c
@@ -0,0 +1,1662 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "mcomp.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+static int mv_ref_ct [31] [4] [2];
+static int mv_mode_cts [4] [2];
+#endif
+
+static int mv_bits_sadcost[256];
+
+extern unsigned int vp8_sub_pixel_variance16x16s_neon
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+);
+extern unsigned int vp8_sub_pixel_variance16x16s_4_0_neon
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+);
+extern unsigned int vp8_sub_pixel_variance16x16s_0_4_neon
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+);
+extern unsigned int vp8_sub_pixel_variance16x16s_4_4_neon
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+);
+
+void vp8cx_init_mv_bits_sadcost()
+{
+ int i;
+
+ for (i = 0; i < 256; i++)
+ {
+ mv_bits_sadcost[i] = (int)sqrt(i * 16);
+ }
+}
+
+
+int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight)
+{
+ // MV costing is based on the distribution of vectors in the previous frame and as such will tend to
+ // overstate the cost of vectors. In addition, coding a new vector can have a knock-on effect on the
+ // cost of subsequent vectors and on the quality of prediction from NEAR and NEAREST for subsequent blocks.
+ // The "Weight" parameter allows, to a limited extent, for some account to be taken of these factors.
+ return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * Weight) >> 7;
+}
+
+int vp8_mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
+{
+ //int i;
+ //return ((mvcost[0][(mv->row - ref->row)>>1] + mvcost[1][(mv->col - ref->col)>>1] + 128) * error_per_bit) >> 8;
+ //return ( (vp8_mv_bit_cost(mv, ref, mvcost, 100) + 128) * error_per_bit) >> 8;
+
+ //i = (vp8_mv_bit_cost(mv, ref, mvcost, 100) * error_per_bit + 128) >> 8;
+ return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * error_per_bit + 128) >> 8;
+ //return (vp8_mv_bit_cost(mv, ref, mvcost, 128) * error_per_bit + 128) >> 8;
+}
+
+
+static int mv_bits(MV *mv, MV *ref, int *mvcost[2])
+{
+ // get the estimated number of bits for a motion vector, to be used for costing in SAD-based
+ // motion estimation
+ return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col)>> 1]) + 128) >> 8;
+}
+
+void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride)
+{
+ int Len;
+ int search_site_count = 0;
+
+
+ // Generate offsets for 4 search sites per step.
+ Len = MAX_FIRST_STEP;
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = 0;
+ search_site_count++;
+
+ while (Len > 0)
+ {
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = -Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = Len;
+ search_site_count++;
+
+ // Contract.
+ Len /= 2;
+ }
+
+ x->ss_count = search_site_count;
+ x->searches_per_step = 4;
+}
+
+void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
+{
+ int Len;
+ int search_site_count = 0;
+
+ // Generate offsets for 8 search sites per step.
+ Len = MAX_FIRST_STEP;
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = 0;
+ search_site_count++;
+
+ while (Len > 0)
+ {
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = -Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride - Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride + Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride - Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride + Len;
+ search_site_count++;
+
+
+ // Contract.
+ Len /= 2;
+ }
+
+ x->ss_count = search_site_count;
+ x->searches_per_step = 8;
+}
+
+
+#define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motion vector
+#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
+#define DIST(r,c) svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
+#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
+#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+
+//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
+
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+ unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+ unsigned char *z = (*(b->base_src) + b->src);
+
+ int rr = ref_mv->row >> 1, rc = ref_mv->col >> 1;
+ int br = bestmv->row << 2, bc = bestmv->col << 2;
+ int tr = br, tc = bc;
+ unsigned int besterr = INT_MAX;
+ unsigned int left, right, up, down, diag;
+ unsigned int sse;
+ unsigned int whichdir;
+ unsigned int halfiters = 4;
+ unsigned int quarteriters = 4;
+
+ int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1));
+ int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1));
+ int minr = MAX(x->mv_row_min << 2, (ref_mv->row >> 1) - ((1 << mvlong_width) - 1));
+ int maxr = MIN(x->mv_row_max << 2, (ref_mv->row >> 1) + ((1 << mvlong_width) - 1));
+
+ // central mv
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+
+ // calculate central point error
+ besterr = vf(y, d->pre_stride, z, b->src_stride, &sse);
+ besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+ // TODO: Each subsequent iteration re-checks at least one point in common with the last iteration (two if the diagonal was selected)
+ while (--halfiters)
+ {
+ // 1/2 pel
+ CHECK_BETTER(left, tr, tc - 2);
+ CHECK_BETTER(right, tr, tc + 2);
+ CHECK_BETTER(up, tr - 2, tc);
+ CHECK_BETTER(down, tr + 2, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir)
+ {
+ case 0:
+ CHECK_BETTER(diag, tr - 2, tc - 2);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - 2, tc + 2);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + 2, tc - 2);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + 2, tc + 2);
+ break;
+ }
+
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ // TODO: Each subsequent iteration re-checks at least one point in common with the last iteration (two if the diagonal was selected)
+ // 1/4 pel
+ while (--quarteriters)
+ {
+ CHECK_BETTER(left, tr, tc - 1);
+ CHECK_BETTER(right, tr, tc + 1);
+ CHECK_BETTER(up, tr - 1, tc);
+ CHECK_BETTER(down, tr + 1, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir)
+ {
+ case 0:
+ CHECK_BETTER(diag, tr - 1, tc - 1);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - 1, tc + 1);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + 1, tc - 1);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + 1, tc + 1);
+ break;
+ }
+
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ bestmv->row = br << 1;
+ bestmv->col = bc << 1;
+
+ if ((abs(bestmv->col - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs(bestmv->row - ref_mv->row) > MAX_FULL_PEL_VAL))
+ return INT_MAX;
+
+ return besterr;
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef ERR
+#undef CHECK_BETTER
+#undef MIN
+#undef MAX
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+ int bestmse = INT_MAX;
+ MV startmv;
+ //MV this_mv;
+ MV this_mv;
+ unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+ unsigned char *z = (*(b->base_src) + b->src);
+ int left, right, up, down, diag;
+ unsigned int sse;
+ int whichdir ;
+
+
+ // Trap uncodable vectors
+ if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
+ {
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+ return INT_MAX;
+ }
+
+ // central mv
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+ startmv = *bestmv;
+
+ // calculate central point error
+ bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+ bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+ // go left then right and check error
+ this_mv.row = startmv.row;
+ this_mv.col = ((startmv.col - 8) | 4);
+ left = vp8_sub_pixel_variance16x16s_4_0_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
+ left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ }
+
+ this_mv.col += 8;
+ right = vp8_sub_pixel_variance16x16s_4_0_neon(y, d->pre_stride, z, b->src_stride, &sse);
+ right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ }
+
+ // go up then down and check error
+ this_mv.col = startmv.col;
+ this_mv.row = ((startmv.row - 8) | 4);
+ up = vp8_sub_pixel_variance16x16s_0_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+ up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ }
+
+ this_mv.row += 8;
+ down = vp8_sub_pixel_variance16x16s_0_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
+ down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ }
+
+
+ // now check 1 more diagonal
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+ //for(whichdir =0;whichdir<4;whichdir++)
+ //{
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row = (this_mv.row - 8) | 4;
+ diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+ break;
+ case 1:
+ this_mv.col += 4;
+ this_mv.row = (this_mv.row - 8) | 4;
+ diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+ break;
+ case 2:
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row += 4;
+ diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
+ break;
+ case 3:
+ this_mv.col += 4;
+ this_mv.row += 4;
+ diag = vp8_sub_pixel_variance16x16s_4_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+// }
+
+
+ // time to check quarter pels.
+ if (bestmv->row < startmv.row)
+ y -= d->pre_stride;
+
+ if (bestmv->col < startmv.col)
+ y--;
+
+ startmv = *bestmv;
+
+
+
+ // go left then right and check error
+ this_mv.row = startmv.row;
+
+ if (startmv.col & 7)
+ {
+ this_mv.col = startmv.col - 2;
+ left = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.col = (startmv.col - 8) | 6;
+ left = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+
+ left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ }
+
+ this_mv.col += 4;
+ right = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ }
+
+ // go up then down and check error
+ this_mv.col = startmv.col;
+
+ if (startmv.row & 7)
+ {
+ this_mv.row = startmv.row - 2;
+ up = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.row = (startmv.row - 8) | 6;
+ up = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+
+ up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ }
+
+ this_mv.row += 4;
+ down = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ }
+
+
+ // now check 1 more diagonal
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+// for(whichdir=0;whichdir<4;whichdir++)
+// {
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+
+ if (startmv.row & 7)
+ {
+ this_mv.row -= 2;
+
+ if (startmv.col & 7)
+ {
+ this_mv.col -= 2;
+ diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.col = (startmv.col - 8) | 6;
+ diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ }
+ else
+ {
+ this_mv.row = (startmv.row - 8) | 6;
+
+ if (startmv.col & 7)
+ {
+ this_mv.col -= 2;
+ diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.col = (startmv.col - 8) | 6;
+ diag = svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
+ }
+ }
+
+ break;
+ case 1:
+ this_mv.col += 2;
+
+ if (startmv.row & 7)
+ {
+ this_mv.row -= 2;
+ diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.row = (startmv.row - 8) | 6;
+ diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+
+ break;
+ case 2:
+ this_mv.row += 2;
+
+ if (startmv.col & 7)
+ {
+ this_mv.col -= 2;
+ diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.col = (startmv.col - 8) | 6;
+ diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+
+ break;
+ case 3:
+ this_mv.col += 2;
+ this_mv.row += 2;
+ diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+// }
+
+ return bestmse;
+}
+
+int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+ int bestmse = INT_MAX;
+ MV startmv;
+ //MV this_mv;
+ MV this_mv;
+ unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+ unsigned char *z = (*(b->base_src) + b->src);
+ int left, right, up, down, diag;
+ unsigned int sse;
+
+ // Trap uncodable vectors
+ if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
+ {
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+ return INT_MAX;
+ }
+
+ // central mv
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+ startmv = *bestmv;
+
+ // calculate central point error
+ bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+ bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+ // go left then right and check error
+ this_mv.row = startmv.row;
+ this_mv.col = ((startmv.col - 8) | 4);
+ left = vp8_sub_pixel_variance16x16s_4_0_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
+ left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ }
+
+ this_mv.col += 8;
+ right = vp8_sub_pixel_variance16x16s_4_0_neon(y, d->pre_stride, z, b->src_stride, &sse);
+ right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ }
+
+ // go up then down and check error
+ this_mv.col = startmv.col;
+ this_mv.row = ((startmv.row - 8) | 4);
+ up = vp8_sub_pixel_variance16x16s_0_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+ up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ }
+
+ this_mv.row += 8;
+ down = vp8_sub_pixel_variance16x16s_0_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
+ down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ }
+
+ // Somewhat strangely, not doing all the diagonals for half pel is slower than doing them.
+#if 0
+ // now check 1 more diagonal -
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row = (this_mv.row - 8) | 4;
+ diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ case 1:
+ this_mv.col += 4;
+ this_mv.row = (this_mv.row - 8) | 4;
+ diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ case 2:
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row += 4;
+ diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ case 3:
+ this_mv.col += 4;
+ this_mv.row += 4;
+ diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+#else
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row = (this_mv.row - 8) | 4;
+ diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+ this_mv.col += 8;
+ diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row = startmv.row + 4;
+ diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+ this_mv.col += 8;
+ diag = vp8_sub_pixel_variance16x16s_4_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+#endif
+ return bestmse;
+}
+
+#if 1
+
+#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motion vector
+#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
+#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
+#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
+const MV next_chkpts[6][3] =
+{
+ {{ -2, 0}, { -1, -2}, {1, -2}},
+ {{ -1, -2}, {1, -2}, {2, 0}},
+ {{1, -2}, {2, 0}, {1, 2}},
+ {{2, 0}, {1, 2}, { -1, 2}},
+ {{1, 2}, { -1, 2}, { -2, 0}},
+ {{ -1, 2}, { -2, 0}, { -1, -2}}
+};
+int vp8_hex_search
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ MV *ref_mv,
+ MV *best_mv,
+ int search_param,
+ int error_per_bit,
+ int *num00,
+ vp8_variance_fn_t vf,
+ vp8_sad_fn_t sf,
+ int *mvsadcost[2],
+ int *mvcost[2]
+)
+{
+ MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
+ MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
+ int i, j;
+ unsigned char *src = (*(b->base_src) + b->src);
+ int src_stride = b->src_stride;
+ int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
+ unsigned int besterr, thiserr = 0x7fffffff;
+ int k = -1, tk;
+
+ if (bc < x->mv_col_min) bc = x->mv_col_min;
+
+ if (bc > x->mv_col_max) bc = x->mv_col_max;
+
+ if (br < x->mv_row_min) br = x->mv_row_min;
+
+ if (br > x->mv_row_max) br = x->mv_row_max;
+
+ rr >>= 1;
+ rc >>= 1;
+
+ besterr = ERR(br, bc, thiserr);
+
+ // hex search
+ //j=0
+ tr = br;
+ tc = bc;
+
+ for (i = 0; i < 6; i++)
+ {
+ int nr = tr + hex[i].row, nc = tc + hex[i].col;
+
+ if (nc < x->mv_col_min) continue;
+
+ if (nc > x->mv_col_max) continue;
+
+ if (nr < x->mv_row_min) continue;
+
+ if (nr > x->mv_row_max) continue;
+
+ //CHECK_BETTER(thiserr,nr,nc);
+ if ((thiserr = ERR(nr, nc, besterr)) < besterr)
+ {
+ besterr = thiserr;
+ br = nr;
+ bc = nc;
+ k = i;
+ }
+ }
+
+ if (tr == br && tc == bc)
+ goto cal_neighbors;
+
+ for (j = 1; j < 127; j++)
+ {
+ tr = br;
+ tc = bc;
+ tk = k;
+
+ for (i = 0; i < 3; i++)
+ {
+ int nr = tr + next_chkpts[tk][i].row, nc = tc + next_chkpts[tk][i].col;
+
+ if (nc < x->mv_col_min) continue;
+
+ if (nc > x->mv_col_max) continue;
+
+ if (nr < x->mv_row_min) continue;
+
+ if (nr > x->mv_row_max) continue;
+
+ //CHECK_BETTER(thiserr,nr,nc);
+ if ((thiserr = ERR(nr, nc, besterr)) < besterr)
+ {
+ besterr = thiserr;
+ br = nr;
+ bc = nc; //k=(tk+5+i)%6;}
+ k = tk + 5 + i;
+
+ if (k >= 12) k -= 12;
+ else if (k >= 6) k -= 6;
+ }
+ }
+
+ if (tr == br && tc == bc)
+ break;
+ }
+
+ // check 8 1 away neighbors
+cal_neighbors:
+ tr = br;
+ tc = bc;
+
+ for (i = 0; i < 8; i++)
+ {
+ int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
+
+ if (nc < x->mv_col_min) continue;
+
+ if (nc > x->mv_col_max) continue;
+
+ if (nr < x->mv_row_min) continue;
+
+ if (nr > x->mv_row_max) continue;
+
+ CHECK_BETTER(thiserr, nr, nc);
+ }
+
+ best_mv->row = br;
+ best_mv->col = bc;
+
+ return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef ERR
+#undef CHECK_BETTER
+
+#else
+
+#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motion vector
+#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
+#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
+#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
+
+int vp8_hex_search
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ MV *ref_mv,
+ MV *best_mv,
+ int search_param,
+ int error_per_bit,
+ int *num00,
+ vp8_variance_fn_t vf,
+ vp8_sad_fn_t sf,
+ int *mvsadcost[2],
+ int *mvcost[2]
+)
+{
+ MV hex[6] = { { -2, 0}, { -1, -2}, { -1, 2}, {2, 0}, {1, 2}, {1, -2} } ;
+ MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
+ int i, j;
+ unsigned char *src = (*(b->base_src) + b->src);
+ int src_stride = b->src_stride;
+ //int rr= ref_mv->row,rc= ref_mv->col,br=rr,bc=rc,tr,tc;
+ int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
+ unsigned int besterr, thiserr = 0x7fffffff;
+
+ /*
+ if ( rc < x->mv_col_min) bc = x->mv_col_min;
+ if ( rc > x->mv_col_max) bc = x->mv_col_max;
+ if ( rr < x->mv_row_min) br = x->mv_row_min;
+ if ( rr > x->mv_row_max) br = x->mv_row_max;
+ rr>>=1;
+ rc>>=1;
+ br>>=3;
+ bc>>=3;
+ */
+ if (bc < x->mv_col_min) bc = x->mv_col_min;
+
+ if (bc > x->mv_col_max) bc = x->mv_col_max;
+
+ if (br < x->mv_row_min) br = x->mv_row_min;
+
+ if (br > x->mv_row_max) br = x->mv_row_max;
+
+ rr >>= 1;
+ rc >>= 1;
+
+ besterr = ERR(br, bc, thiserr);
+
+ // hex search: jbb changed the loop limit to 127 to avoid the max-256 problem when stepping by 2.
+ for (j = 0; j < 127; j++)
+ {
+ tr = br;
+ tc = bc;
+
+ for (i = 0; i < 6; i++)
+ {
+ int nr = tr + hex[i].row, nc = tc + hex[i].col;
+
+ if (nc < x->mv_col_min) continue;
+
+ if (nc > x->mv_col_max) continue;
+
+ if (nr < x->mv_row_min) continue;
+
+ if (nr > x->mv_row_max) continue;
+
+ CHECK_BETTER(thiserr, nr, nc);
+ }
+
+ if (tr == br && tc == bc)
+ break;
+ }
+
+ // check 8 1 away neighbors
+ tr = br;
+ tc = bc;
+
+ for (i = 0; i < 8; i++)
+ {
+ int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
+
+ if (nc < x->mv_col_min) continue;
+
+ if (nc > x->mv_col_max) continue;
+
+ if (nr < x->mv_row_min) continue;
+
+ if (nr > x->mv_row_max) continue;
+
+ CHECK_BETTER(thiserr, nr, nc);
+ }
+
+ best_mv->row = br;
+ best_mv->col = bc;
+
+ return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef ERR
+#undef CHECK_BETTER
+
+#endif
+
+int vp8_diamond_search_sad
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ MV *ref_mv,
+ MV *best_mv,
+ int search_param,
+ int error_per_bit,
+ int *num00,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvsadcost[2],
+ int *mvcost[2]
+)
+{
+ int i, j, step;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ unsigned char *best_address;
+
+ int tot_steps;
+ MV this_mv;
+
+ int bestsad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row = ref_mv->row >> 3;
+ int ref_col = ref_mv->col >> 3;
+ int this_row_offset;
+ int this_col_offset;
+ search_site *ss;
+
+ unsigned char *check_here;
+ int thissad;
+
+ // Work out the start point for the search
+ in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+ best_address = in_what;
+
+ // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+ if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+ (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+ {
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+ }
+
+ // search_param determines the length of the initial step and hence the number of iterations
+ // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+ ss = &x->ss[search_param * x->searches_per_step];
+ tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+ i = 1;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ *num00 = 0;
+
+ for (step = 0; step < tot_steps ; step++)
+ {
+ for (j = 0 ; j < x->searches_per_step ; j++)
+ {
+ // Trap illegal vectors
+ this_row_offset = best_mv->row + ss[i].mv.row;
+ this_col_offset = best_mv->col + ss[i].mv.col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+
+ {
+ check_here = ss[i].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.row = this_row_offset << 3;
+ this_mv.col = this_col_offset << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+
+ i++;
+ }
+
+ if (best_site != last_site)
+ {
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ }
+ else if (best_address == in_what)
+ (*num00)++;
+ }
+
+ this_mv.row = best_mv->row << 3;
+ this_mv.col = best_mv->col << 3;
+
+ if (bestsad == INT_MAX)
+ return INT_MAX;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+ + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+}
+
+int vp8_diamond_search_sadx4
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ MV *ref_mv,
+ MV *best_mv,
+ int search_param,
+ int error_per_bit,
+ int *num00,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvsadcost[2],
+ int *mvcost[2]
+)
+{
+ int i, j, step;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ unsigned char *best_address;
+
+ int tot_steps;
+ MV this_mv;
+
+ int bestsad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row = ref_mv->row >> 3;
+ int ref_col = ref_mv->col >> 3;
+ int this_row_offset;
+ int this_col_offset;
+ search_site *ss;
+
+ unsigned char *check_here;
+ int thissad;
+
+ // Work out the start point for the search
+ in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+ best_address = in_what;
+
+ // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+ if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+ (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+ {
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+ }
+
+ // search_param determines the length of the initial step and hence the number of iterations
+ // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+ ss = &x->ss[search_param * x->searches_per_step];
+ tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+ i = 1;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ *num00 = 0;
+
+ for (step = 0; step < tot_steps ; step++)
+ {
+ int check_row_min, check_col_min, check_row_max, check_col_max;
+
+ check_row_min = x->mv_row_min - best_mv->row;
+ check_row_max = x->mv_row_max - best_mv->row;
+ check_col_min = x->mv_col_min - best_mv->col;
+ check_col_max = x->mv_col_max - best_mv->col;
+
+ for (j = 0 ; j < x->searches_per_step ; j += 4)
+ {
+ char *block_offset[4];
+ unsigned int valid_block[4];
+ int all_in = 1, t;
+
+ for (t = 0; t < 4; t++)
+ {
+ valid_block [t] = (ss[t+i].mv.col > check_col_min);
+ valid_block [t] &= (ss[t+i].mv.col < check_col_max);
+ valid_block [t] &= (ss[t+i].mv.row > check_row_min);
+ valid_block [t] &= (ss[t+i].mv.row < check_row_max);
+
+ all_in &= valid_block[t];
+ block_offset[t] = ss[i+t].offset + best_address;
+ }
+
+ if (all_in)
+ {
+ int sad_array[4];
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+ for (t = 0; t < 4; t++, i++)
+ {
+ thissad = sad_array[t];
+
+ if (thissad < bestsad)
+ {
+ this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
+ this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ }
+ else
+ {
+ int t;
+
+ for (t = 0; t < 4; i++, t++)
+ {
+ // Trap illegal vectors
+ if (valid_block[t])
+
+ {
+ check_here = block_offset[t];
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_row_offset = best_mv->row + ss[i].mv.row;
+ this_col_offset = best_mv->col + ss[i].mv.col;
+
+ this_mv.row = this_row_offset << 3;
+ this_mv.col = this_col_offset << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (best_site != last_site)
+ {
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ }
+ else if (best_address == in_what)
+ (*num00)++;
+ }
+
+ this_mv.row = best_mv->row << 3;
+ this_mv.col = best_mv->col << 3;
+
+ if (bestsad == INT_MAX)
+ return INT_MAX;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+ + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+}
+
+
+#if !(CONFIG_REALTIME_ONLY)
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ int mv_stride = d->pre_stride;
+ unsigned char *bestaddress;
+ MV *best_mv = &d->bmi.mv.as_mv;
+ MV this_mv;
+ int bestsad = INT_MAX;
+ int r, c;
+
+ unsigned char *check_here;
+ int thissad;
+
+ int ref_row = ref_mv->row >> 3;
+ int ref_col = ref_mv->col >> 3;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ // Work out the mid point for the search
+ in_what = *(d->base_pre) + d->pre;
+ bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+ if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+ (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+ {
+ // Baseline value at the centre
+
+ //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+ }
+
+    // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
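+    // Illustration: with distance == 16 and no clamping, the loops below
+    // visit a 32 x 32 grid of whole-pel candidates centred on ref_mv.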
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.row = r << 3;
+ check_here = r * mv_stride + in_what + col_min;
+
+ for (c = col_min; c < col_max; c++)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ this_mv.col = c << 3;
+ //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
+ //thissad += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+
+ check_here++;
+ }
+ }
+
+ this_mv.row = best_mv->row << 3;
+ this_mv.col = best_mv->col << 3;
+
+ if (bestsad < INT_MAX)
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+ else
+ return INT_MAX;
+}
+
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ int mv_stride = d->pre_stride;
+ unsigned char *bestaddress;
+ MV *best_mv = &d->bmi.mv.as_mv;
+ MV this_mv;
+ int bestsad = INT_MAX;
+ int r, c;
+
+ unsigned char *check_here;
+ int thissad;
+
+ int ref_row = ref_mv->row >> 3;
+ int ref_col = ref_mv->col >> 3;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ int sad_array[3];
+
+ // Work out the mid point for the search
+ in_what = *(d->base_pre) + d->pre;
+ bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+ if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+ (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+ {
+ // Baseline value at the centre
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+ }
+
+    // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.row = r << 3;
+ check_here = r * mv_stride + in_what + col_min;
+ c = col_min;
+
+ while ((c + 3) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
+
+ for (i = 0; i < 3; i++)
+ {
+ thissad = sad_array[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.col = c << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while (c < col_max)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.col = c << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here ++;
+ c ++;
+ }
+
+ }
+
+ this_mv.row = best_mv->row << 3;
+ this_mv.col = best_mv->col << 3;
+
+ if (bestsad < INT_MAX)
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+ else
+ return INT_MAX;
+}
+#endif
+
+#ifdef ENTROPY_STATS
+void print_mode_context(void)
+{
+ FILE *f = fopen("modecont.c", "w");
+ int i, j;
+
+ fprintf(f, "#include \"entropy.h\"\n");
+ fprintf(f, "const int vp8_mode_contexts[6][4] =\n");
+ fprintf(f, "{\n");
+
+ for (j = 0; j < 6; j++)
+ {
+ fprintf(f, " { // %d \n", j);
+ fprintf(f, " ");
+
+ for (i = 0; i < 4; i++)
+ {
+ int overal_prob;
+ int this_prob;
+ int count; // = mv_ref_ct[j][i][0]+mv_ref_ct[j][i][1];
+
+ // Overall probs
+ count = mv_mode_cts[i][0] + mv_mode_cts[i][1];
+
+ if (count)
+ overal_prob = 256 * mv_mode_cts[i][0] / count;
+ else
+ overal_prob = 128;
+
+ if (overal_prob == 0)
+ overal_prob = 1;
+
+ // context probs
+ count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+
+ if (count)
+ this_prob = 256 * mv_ref_ct[j][i][0] / count;
+ else
+ this_prob = 128;
+
+ if (this_prob == 0)
+ this_prob = 1;
+
+ fprintf(f, "%5d, ", this_prob);
+ //fprintf(f,"%5d, %5d, %8d,", this_prob, overal_prob, (this_prob << 10)/overal_prob);
+ //fprintf(f,"%8d, ", (this_prob << 10)/overal_prob);
+ }
+
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, "};\n");
+ fclose(f);
+}
+
+/* MV ref count ENTROPY_STATS stats code */
+#ifdef ENTROPY_STATS
+void init_mv_ref_counts()
+{
+ vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+ vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+}
+
+void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
+{
+ if (m == ZEROMV)
+ {
+ ++mv_ref_ct [ct[0]] [0] [0];
+ ++mv_mode_cts[0][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[0]] [0] [1];
+ ++mv_mode_cts[0][1];
+
+ if (m == NEARESTMV)
+ {
+ ++mv_ref_ct [ct[1]] [1] [0];
+ ++mv_mode_cts[1][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[1]] [1] [1];
+ ++mv_mode_cts[1][1];
+
+ if (m == NEARMV)
+ {
+ ++mv_ref_ct [ct[2]] [2] [0];
+ ++mv_mode_cts[2][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[2]] [2] [1];
+ ++mv_mode_cts[2][1];
+
+ if (m == NEWMV)
+ {
+ ++mv_ref_ct [ct[3]] [3] [0];
+ ++mv_mode_cts[3][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[3]] [3] [1];
+ ++mv_mode_cts[3][1];
+ }
+ }
+ }
+ }
+}
+
+#endif/* END MV ref count ENTROPY_STATS stats code */
+
+#endif
diff --git a/vp8/encoder/arm/neon/boolhuff_armv7.asm b/vp8/encoder/arm/neon/boolhuff_armv7.asm
new file mode 100644
index 000000000..9a5f36661
--- /dev/null
+++ b/vp8/encoder/arm/neon/boolhuff_armv7.asm
@@ -0,0 +1,292 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_start_encode|
+ EXPORT |vp8_encode_bool|
+ EXPORT |vp8_stop_encode|
+ EXPORT |vp8_encode_value|
+
+ INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 BOOL_CODER *br
+; r1 unsigned char *source
+
+|vp8_start_encode| PROC
+ mov r12, #0
+ mov r3, #255
+ mvn r2, #23
+ str r12, [r0, #vp8_writer_lowvalue]
+ str r3, [r0, #vp8_writer_range]
+ str r12, [r0, #vp8_writer_value]
+ str r2, [r0, #vp8_writer_count]
+ str r12, [r0, #vp8_writer_pos]
+ str r1, [r0, #vp8_writer_buffer]
+ bx lr
+ ENDP
+
+; r0 BOOL_CODER *br
+; r1 int bit
+; r2 int probability
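+
+; A rough C sketch of what one call does, for orientation (an illustrative
+; simplification, not the exact boolhuff.c source):
+;
+;   split = 1 + (((range - 1) * probability) >> 8);
+;   if (bit) { lowvalue += split; range -= split; }
+;   else     { range = split; }
+;   shift = clz(range) - 24;        /* renormalize range back into [128, 255] */
+;   range <<= shift;
+;   count += shift;
+;   if (count >= 0) {
+;       /* emit one byte from the top of lowvalue, propagating any pending
+;          carry back through 0xff bytes already written to w->buffer */
+;   }
+;   lowvalue <<= shift;             /* shift is reduced when a byte goes out */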
+|vp8_encode_bool| PROC
+ push {r4-r9, lr}
+
+ mov r4, r2
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+ sub r7, r5, #1 ; range-1
+
+ cmp r1, #0
+ mul r4, r4, r7 ; ((range-1) * probability)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * probability) >> 8)
+
+ addne r2, r2, r4 ; if (bit) lowvalue += split
+ subne r4, r5, r4 ; if (bit) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r1, [r7, r4]
+ cmpge r1, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r1, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r1, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ pop {r4-r9, pc}
+ ENDP
+
+; r0 BOOL_CODER *br
+|vp8_stop_encode| PROC
+ push {r4-r10, lr}
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+ mov r10, #32
+
+stop_encode_loop
+ sub r7, r5, #1 ; range-1
+
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_se ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_se
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_se
+token_zero_while_loop_se
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_se
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r1, [r7, r4]
+ cmpge r1, #0xff
+ beq token_zero_while_loop_se
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_se
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r1, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r1, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r10, r10, #1
+ bne stop_encode_loop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ pop {r4-r10, pc}
+
+ ENDP
+
+; r0 BOOL_CODER *br
+; r1 int data
+; r2 int bits
+|vp8_encode_value| PROC
+ push {r4-r11, lr}
+
+ mov r10, r2
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+ ; reverse the stream of bits to be packed. Normally
+ ; the most significant bit is peeled off and compared
+ ; in the form of (v >> --n) & 1. ARM architecture has
+ ; the ability to set a flag based on the value of the
+ ; bit shifted off the bottom of the register. To make
+ ; that happen the bitstream is reversed.
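+    ; e.g. (illustrative): to pack the 3-bit value 0b101 we must emit 1, 0, 1
+    ; (MSB first). After rbit and the "v >>= 32 - n" below, those same bits
+    ; sit at the bottom of the register, so each lsrs #1 in the loop drops
+    ; the next bit into the carry flag for the addcs/subcs pair to test.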
+ rbit r11, r1
+ rsb r4, r10, #32 ; 32-n
+
+ ; v is kept in r1 during the token pack loop
+ lsr r1, r11, r4 ; v >>= 32 - n
+
+encode_value_loop
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsrs r1, r1, #1 ; bit = v >> n
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ addcs r2, r2, r4 ; if (bit) lowvalue += split
+ subcs r4, r5, r4 ; if (bit) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_ev ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_ev
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_ev
+token_zero_while_loop_ev
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_ev
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop_ev
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_ev
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_ev
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r10, r10, #1
+ bne encode_value_loop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ pop {r4-r11, pc}
+ ENDP
+
+ END
diff --git a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
new file mode 100644
index 000000000..d5dec440d
--- /dev/null
+++ b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
@@ -0,0 +1,126 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_fdct4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
+;NOTE:
+;The input is *src_diff, which is calculated as:
+;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in the Subtract* functions)
+;Since *src_ptr and *pred_ptr are both unsigned char, *src_diff is in the
+;range [-255, 255].
+;CAUTION:
+;The input values of the 25th (DC) block are set in vp8_build_dcblock and can
+;fall outside [-255, 255]. However, the VP8 encoder only uses vp8_short_fdct4x4_c
+;for the 25th block, never vp8_fast_fdct4x4_c, so assuming *input is in
+;[-255, 255] is safe here, but would not be safe in vp8_short_fdct4x4_c.
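+;(Illustration: with |input| <= 255, a1 = ip[0]+ip[3] is at most 510 and the
+; "<<1" below at most 1020, leaving ample headroom in the signed 16-bit lanes;
+; the DC block from vp8_build_dcblock can exceed that range, which is why it
+; is sent through vp8_short_fdct4x4_c instead.)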
+
+|vp8_fast_fdct4x4_neon| PROC
+ vld1.16 {d2}, [r0], r2 ;load input
+ ldr r12, _ffdct_coeff_
+ vld1.16 {d3}, [r0], r2
+ vld1.16 {d4}, [r0], r2
+ vld1.16 {d0}, [r12]
+ vld1.16 {d5}, [r0], r2
+
+ ;First for-loop
+ ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vadd.s16 d6, d2, d5 ;ip[0]+ip[3]
+ vadd.s16 d7, d3, d4 ;ip[1]+ip[2]
+ vsub.s16 d8, d3, d4 ;ip[1]-ip[2]
+ vsub.s16 d9, d2, d5 ;ip[0]-ip[3]
+ vshl.i16 q3, q3, #1 ; a1, b1
+ vshl.i16 q4, q4, #1 ; c1, d1
+
+ vadd.s16 d10, d6, d7 ;temp1 = a1 + b1
+ vsub.s16 d11, d6, d7 ;temp2 = a1 - b1
+
+ vqdmulh.s16 q6, q5, d0[1]
+ vqdmulh.s16 q8, q4, d0[0]
+ vqdmulh.s16 q7, q4, d0[2]
+
+ vshr.s16 q6, q6, #1
+ vshr.s16 q8, q8, #1
+ vshr.s16 q7, q7, #1 ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16
+ vadd.s16 q8, q4, q8 ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1
+
+ vadd.s16 d2, d10, d12 ;op[0] = ((temp1 * x_c2 )>>16) + temp1
+ vadd.s16 d4, d11, d13 ;op[2] = ((temp2 * x_c2 )>>16) + temp2
+ vadd.s16 d3, d14, d17 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
+ vsub.s16 d5, d15, d16 ;op[3] = temp1 - temp2
+
+ ;Second for-loop
+ ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[12]
+ vadd.s16 d7, d3, d4 ;b1 = ip[4]+ip[8]
+ vsub.s16 d8, d3, d4 ;c1 = ip[4]-ip[8]
+ vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[12]
+
+ vadd.s16 d10, d6, d7 ;temp1 = a1 + b1
+ vsub.s16 d11, d6, d7 ;temp2 = a1 - b1
+
+
+ vqdmulh.s16 q6, q5, d0[1]
+ vqdmulh.s16 q8, q4, d0[0]
+ vqdmulh.s16 q7, q4, d0[2]
+
+ vshr.s16 q6, q6, #1
+ vshr.s16 q8, q8, #1
+ vshr.s16 q7, q7, #1 ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16
+ vadd.s16 q8, q4, q8 ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1
+
+ vadd.s16 d2, d10, d12 ;a2 = ((temp1 * x_c2 )>>16) + temp1
+ vadd.s16 d4, d11, d13 ;c2 = ((temp2 * x_c2 )>>16) + temp2
+ vadd.s16 d3, d14, d17 ;b2 = temp1 + temp2 -- q is not necessary, just for protection
+ vsub.s16 d5, d15, d16 ;d2 = temp1 - temp2
+
+ vclt.s16 q3, q1, #0
+ vclt.s16 q4, q2, #0
+
+ vsub.s16 q1, q1, q3
+ vsub.s16 q2, q2, q4
+
+ vshr.s16 q1, q1, #1
+ vshr.s16 q2, q2, #1
+
+ vst1.16 {q1, q2}, [r1]
+
+ bx lr
+
+ ENDP
+
+;-----------------
+ AREA fastfdct_dat, DATA, READONLY
+;Data section for the fdct coefficients. Each DCD below reserves one word.
+;The label ffdct_coeff can be used to access the data.
+;Data addresses: ffdct_coeff, ffdct_coeff+4 ...
+_ffdct_coeff_
+ DCD ffdct_coeff
+ffdct_coeff
+; 60547 = 0xEC83
+; 46341 = 0xB505
+; 25080 = 0x61F8
+ DCD 0xB505EC83, 0x000061F8
+
+ END
diff --git a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
new file mode 100644
index 000000000..de1c25469
--- /dev/null
+++ b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
@@ -0,0 +1,179 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_fdct8x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_fast_fdct8x4_c(short *input, short *output, int pitch);
+;NOTE:
+;The input is *src_diff, which is calculated as:
+;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in the Subtract* functions)
+;Since *src_ptr and *pred_ptr are both unsigned char, *src_diff is in the
+;range [-255, 255].
+;CAUTION:
+;The input values of the 25th (DC) block are set in vp8_build_dcblock and can
+;fall outside [-255, 255]. However, the VP8 encoder only uses vp8_short_fdct4x4_c
+;for the 25th block, never the fast fdct, so assuming *input is in [-255, 255]
+;is safe here, but would not be safe in vp8_short_fdct4x4_c.
+
+|vp8_fast_fdct8x4_neon| PROC
+ vld1.16 {q1}, [r0], r2 ;load input
+ ldr r12, _ffdct8_coeff_
+ vld1.16 {q2}, [r0], r2
+ vld1.16 {q3}, [r0], r2
+ vld1.16 {d0}, [r12]
+ vld1.16 {q4}, [r0], r2
+
+ ;First for-loop
+ ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
+ ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
+ vtrn.32 d2, d6
+ vtrn.32 d3, d7
+ vtrn.32 d4, d8
+ vtrn.32 d5, d9
+ vtrn.16 d2, d4
+ vtrn.16 d3, d5
+ vtrn.16 d6, d8
+ vtrn.16 d7, d9
+
+ vadd.s16 d10, d2, d8 ;ip[0]+ip[3]
+ vadd.s16 d11, d4, d6 ;ip[1]+ip[2]
+ vsub.s16 d12, d4, d6 ;ip[1]-ip[2]
+ vsub.s16 d13, d2, d8 ;ip[0]-ip[3]
+ vadd.s16 d22, d3, d9
+ vadd.s16 d23, d5, d7
+ vsub.s16 d24, d5, d7
+ vsub.s16 d25, d3, d9
+
+ vshl.i16 q5, q5, #1 ; a1, b1
+ vshl.i16 q6, q6, #1 ; c1, d1
+ vshl.i16 q1, q11, #1
+ vshl.i16 q2, q12, #1
+
+ vadd.s16 d14, d10, d11 ;temp1 = a1 + b1
+ vsub.s16 d15, d10, d11 ;temp2 = a1 - b1
+ vadd.s16 d24, d2, d3
+ vsub.s16 d25, d2, d3
+
+ vqdmulh.s16 q8, q7, d0[1]
+ vqdmulh.s16 q13, q12, d0[1]
+ vqdmulh.s16 q10, q6, d0[0]
+ vqdmulh.s16 q15, q2, d0[0]
+ vqdmulh.s16 q9, q6, d0[2]
+ vqdmulh.s16 q14, q2, d0[2]
+
+ vshr.s16 q8, q8, #1
+ vshr.s16 q13, q13, #1
+ vshr.s16 q10, q10, #1
+ vshr.s16 q15, q15, #1
+ vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16
+ vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16
+ vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1
+ vadd.s16 q15, q2, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1
+
+ vadd.s16 d2, d14, d16 ;op[0] = ((temp1 * x_c2 )>>16) + temp1
+ vadd.s16 d3, d24, d26 ;op[0] = ((temp1 * x_c2 )>>16) + temp1
+ vadd.s16 d6, d15, d17 ;op[2] = ((temp2 * x_c2 )>>16) + temp2
+ vadd.s16 d7, d25, d27 ;op[2] = ((temp2 * x_c2 )>>16) + temp2
+ vadd.s16 d4, d18, d21 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
+ vadd.s16 d5, d28, d31 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
+ vsub.s16 d8, d19, d20 ;op[3] = temp1 - temp2
+ vsub.s16 d9, d29, d30 ;op[3] = temp1 - temp2
+
+ ;Second for-loop
+ ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
+ ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
+ vtrn.32 d2, d6
+ vtrn.32 d3, d7
+ vtrn.32 d4, d8
+ vtrn.32 d5, d9
+ vtrn.16 d2, d4
+ vtrn.16 d3, d5
+ vtrn.16 d6, d8
+ vtrn.16 d7, d9
+
+ vadd.s16 d10, d2, d8 ;a1 = ip[0]+ip[12]
+ vadd.s16 d11, d4, d6 ;b1 = ip[4]+ip[8]
+ vsub.s16 d12, d4, d6 ;c1 = ip[4]-ip[8]
+ vsub.s16 d13, d2, d8 ;d1 = ip[0]-ip[12]
+ vadd.s16 d2, d3, d9
+ vadd.s16 d4, d5, d7
+ vsub.s16 d24, d5, d7
+ vsub.s16 d25, d3, d9
+
+ vadd.s16 d14, d10, d11 ;temp1 = a1 + b1
+ vsub.s16 d15, d10, d11 ;temp2 = a1 - b1
+ vadd.s16 d22, d2, d4
+ vsub.s16 d23, d2, d4
+
+ vqdmulh.s16 q8, q7, d0[1]
+ vqdmulh.s16 q13, q11, d0[1]
+ vqdmulh.s16 q10, q6, d0[0]
+ vqdmulh.s16 q15, q12, d0[0]
+ vqdmulh.s16 q9, q6, d0[2]
+ vqdmulh.s16 q14, q12, d0[2]
+
+ vshr.s16 q8, q8, #1
+ vshr.s16 q13, q13, #1
+ vshr.s16 q10, q10, #1
+ vshr.s16 q15, q15, #1
+ vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16
+ vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16
+ vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1
+ vadd.s16 q15, q12, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1
+
+ vadd.s16 d2, d14, d16 ;a2 = ((temp1 * x_c2 )>>16) + temp1
+ vadd.s16 d6, d22, d26 ;a2 = ((temp1 * x_c2 )>>16) + temp1
+ vadd.s16 d4, d15, d17 ;c2 = ((temp2 * x_c2 )>>16) + temp2
+ vadd.s16 d8, d23, d27 ;c2 = ((temp2 * x_c2 )>>16) + temp2
+ vadd.s16 d3, d18, d21 ;b2 = temp1 + temp2 -- q is not necessary, just for protection
+ vadd.s16 d7, d28, d31 ;b2 = temp1 + temp2 -- q is not necessary, just for protection
+ vsub.s16 d5, d19, d20 ;d2 = temp1 - temp2
+ vsub.s16 d9, d29, d30 ;d2 = temp1 - temp2
+
+ vclt.s16 q5, q1, #0
+ vclt.s16 q6, q2, #0
+ vclt.s16 q7, q3, #0
+ vclt.s16 q8, q4, #0
+
+ vsub.s16 q1, q1, q5
+ vsub.s16 q2, q2, q6
+ vsub.s16 q3, q3, q7
+ vsub.s16 q4, q4, q8
+
+ vshr.s16 q1, q1, #1
+ vshr.s16 q2, q2, #1
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+ vst1.16 {q1, q2}, [r1]!
+ vst1.16 {q3, q4}, [r1]
+
+ bx lr
+
+ ENDP
+
+;-----------------
+ AREA fastfdct8x4_dat, DATA, READONLY
+;Data section for the fdct coefficients. Each DCD below reserves one word.
+;The label ffdct8_coeff can be used to access the data.
+;Data addresses: ffdct8_coeff, ffdct8_coeff+4 ...
+_ffdct8_coeff_
+ DCD ffdct8_coeff
+ffdct8_coeff
+; 60547 = 0xEC83
+; 46341 = 0xB505
+; 25080 = 0x61F8
+ DCD 0xB505EC83, 0x000061F8
+
+ END
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
new file mode 100644
index 000000000..11070377b
--- /dev/null
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
@@ -0,0 +1,117 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_quantize_b_neon_func|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 short *coeff_ptr
+; r1 short *zbin_ptr
+; r2 short *qcoeff_ptr
+; r3 short *dqcoeff_ptr
+; stack short *dequant_ptr
+; stack short *scan_mask
+; stack short *round_ptr
+; stack short *quant_ptr
+
+; returns int eob
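+;
+; Roughly, per coefficient (an illustrative C sketch, not the exact
+; quantize.c source):
+;   x  = abs(z);                   sz = (z < 0) ? -1 : 0;
+;   if (x < zbin) { qcoeff = dqcoeff = 0; }
+;   else {
+;       y       = ((x + round) * quant) >> 16;
+;       x1      = (y ^ sz) - sz;   /* restore the sign of z */
+;       qcoeff  = x1;
+;       dqcoeff = x1 * dequant;
+;   }
+; eob is then 1 + the highest scan position holding a non-zero qcoeff
+; (taken from the scan-order table loaded below).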
+|vp8_fast_quantize_b_neon_func| PROC
+ vld1.16 {q0, q1}, [r0] ;load z
+ vld1.16 {q10, q11}, [r1] ;load zbin
+
+ vabs.s16 q4, q0 ;calculate x = abs(z)
+ vabs.s16 q5, q1
+
+ vcge.s16 q10, q4, q10 ;x>=zbin
+ vcge.s16 q11, q5, q11
+
+ ;if x<zbin (q10 & q11 are all 0), go to zero_output
+ vorr.s16 q6, q10, q11
+ vorr.s16 d12, d12, d13
+ vmov r0, r1, d12
+ orr r0, r0, r1
+ cmp r0, #0
+ beq zero_output
+
+ ldr r0, [sp, #8] ;load round_ptr
+ ldr r12, [sp, #12] ;load quant_ptr
+
+ ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+ vshr.s16 q2, q0, #15 ; sz
+ vshr.s16 q3, q1, #15
+
+ vld1.s16 {q6, q7}, [r0] ;load round_ptr [0-15]
+ vld1.s16 {q8, q9}, [r12] ;load quant_ptr [0-15]
+
+ vadd.s16 q4, q6 ;x + Round
+ vadd.s16 q5, q7
+
+ ldr r0, [sp, #4] ;load rvsplus1_scan_order ptr
+
+ vqdmulh.s16 q4, q8 ;y = ((Round + abs(z)) * Quant) >> 16
+ vqdmulh.s16 q5, q9
+
+ vld1.16 {q0, q1}, [r0] ;load rvsplus1_scan_order
+ vceq.s16 q8, q8 ;set q8 to all 1
+
+ vshr.s16 q4, #1 ;right shift 1 after vqdmulh
+ vshr.s16 q5, #1
+
+ ;modify data to have its original sign
+ veor.s16 q4, q2 ; y^sz
+ veor.s16 q5, q3
+
+ ldr r12, [sp] ;load dequant_ptr
+
+ vsub.s16 q4, q2 ; x1 = (y^sz) - sz = (y^sz) - (-1) (two's complement)
+ vsub.s16 q5, q3
+
+ vand.s16 q4, q10 ;mask off x1 elements
+ vand.s16 q5, q11
+
+ vld1.s16 {q6, q7}, [r12] ;load dequant_ptr[i]
+
+ vtst.16 q14, q4, q8 ;now find eob
+ vtst.16 q15, q5, q8 ;non-zero element is set to all 1 in q4, q5
+
+ vst1.s16 {q4, q5}, [r2] ;store: qcoeff = x1
+
+ vand q0, q0, q14 ;get all valid number from rvsplus1_scan_order array
+ vand q1, q1, q15
+
+ vmax.u16 q0, q0, q1 ;find maximum value in q0, q1
+ vmax.u16 d0, d0, d1
+ vmovl.u16 q0, d0
+
+ vmul.s16 q6, q4 ;x * Dequant
+ vmul.s16 q7, q5
+
+ vmax.u32 d0, d0, d1
+ vpmax.u32 d0, d0, d0
+
+ vst1.s16 {q6, q7}, [r3] ;store dqcoeff = x * Dequant
+
+ vmov.32 r0, d0[0]
+ bx lr
+
+zero_output
+ vst1.s16 {q10, q11}, [r2] ; qcoeff = 0
+ vst1.s16 {q10, q11}, [r3] ; dqcoeff = 0
+ mov r0, #0
+
+ bx lr
+
+ ENDP
+
+ END
diff --git a/vp8/encoder/arm/neon/sad16_neon.asm b/vp8/encoder/arm/neon/sad16_neon.asm
new file mode 100644
index 000000000..6169f10da
--- /dev/null
+++ b/vp8/encoder/arm/neon/sad16_neon.asm
@@ -0,0 +1,206 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad16x16_neon|
+ EXPORT |vp8_sad16x8_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int src_stride
+; r2 unsigned char *ref_ptr
+; r3 int ref_stride
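+;
+; SAD here is the sum over the 16x16 block of abs(src[r][c] - ref[r][c]);
+; the vabdl/vabal instructions below compute and accumulate exactly that.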
+|vp8_sad16x16_neon| PROC
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+ vabdl.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0]
+ vld1.8 {q7}, [r2]
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vadd.u16 q0, q12, q13
+
+ vpaddl.u16 q1, q0
+ vpaddl.u32 q0, q1
+
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;==============================
+;unsigned int vp8_sad16x8_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+|vp8_sad16x8_neon| PROC
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+ vabdl.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vadd.u16 q0, q12, q13
+
+ vpaddl.u16 q1, q0
+ vpaddl.u32 q0, q1
+
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+ END
diff --git a/vp8/encoder/arm/neon/sad8_neon.asm b/vp8/encoder/arm/neon/sad8_neon.asm
new file mode 100644
index 000000000..28604ddeb
--- /dev/null
+++ b/vp8/encoder/arm/neon/sad8_neon.asm
@@ -0,0 +1,208 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad8x8_neon|
+ EXPORT |vp8_sad8x16_neon|
+ EXPORT |vp8_sad4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; unsigned int vp8_sad8x8_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad8x8_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 q1, q12
+ vpaddl.u32 q0, q1
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;============================
+;unsigned int vp8_sad8x16_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad8x16_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 q1, q12
+ vpaddl.u32 q0, q1
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;===========================
+;unsigned int vp8_sad4x4_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad4x4_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 d1, d24
+ vpaddl.u32 d0, d1
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+ END
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.asm b/vp8/encoder/arm/neon/shortfdct_neon.asm
new file mode 100644
index 000000000..26bc0d06c
--- /dev/null
+++ b/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -0,0 +1,146 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_fdct4x4_neon|
+ EXPORT |vp8_short_fdct8x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 short *input
+; r1 short *output
+; r2 int pitch
+; Input has a pitch, output is contiguous
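+; The transform is implemented as two passes of multiply-accumulates against
+; the constant matrix stored at _dct_matrix_ below, with rounding right
+; shifts of 14 (first stage) and 16 (second stage).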
+|vp8_short_fdct4x4_neon| PROC
+ ldr r12, _dct_matrix_
+ vld1.16 d0, [r0], r2
+ vld1.16 d1, [r0], r2
+ vld1.16 d2, [r0], r2
+ vld1.16 d3, [r0]
+ vld1.16 {q2, q3}, [r12]
+
+;first stage
+ vmull.s16 q11, d4, d0[0] ;i=0
+ vmull.s16 q12, d4, d1[0] ;i=1
+ vmull.s16 q13, d4, d2[0] ;i=2
+ vmull.s16 q14, d4, d3[0] ;i=3
+
+ vmlal.s16 q11, d5, d0[1]
+ vmlal.s16 q12, d5, d1[1]
+ vmlal.s16 q13, d5, d2[1]
+ vmlal.s16 q14, d5, d3[1]
+
+ vmlal.s16 q11, d6, d0[2]
+ vmlal.s16 q12, d6, d1[2]
+ vmlal.s16 q13, d6, d2[2]
+ vmlal.s16 q14, d6, d3[2]
+
+ vmlal.s16 q11, d7, d0[3] ;sumtemp for i=0
+ vmlal.s16 q12, d7, d1[3] ;sumtemp for i=1
+ vmlal.s16 q13, d7, d2[3] ;sumtemp for i=2
+ vmlal.s16 q14, d7, d3[3] ;sumtemp for i=3
+
+ ; rounding
+ vrshrn.i32 d22, q11, #14
+ vrshrn.i32 d24, q12, #14
+ vrshrn.i32 d26, q13, #14
+ vrshrn.i32 d28, q14, #14
+
+;second stage
+ vmull.s16 q4, d22, d4[0] ;i=0
+ vmull.s16 q5, d22, d4[1] ;i=1
+ vmull.s16 q6, d22, d4[2] ;i=2
+ vmull.s16 q7, d22, d4[3] ;i=3
+
+ vmlal.s16 q4, d24, d5[0]
+ vmlal.s16 q5, d24, d5[1]
+ vmlal.s16 q6, d24, d5[2]
+ vmlal.s16 q7, d24, d5[3]
+
+ vmlal.s16 q4, d26, d6[0]
+ vmlal.s16 q5, d26, d6[1]
+ vmlal.s16 q6, d26, d6[2]
+ vmlal.s16 q7, d26, d6[3]
+
+ vmlal.s16 q4, d28, d7[0] ;sumtemp for i=0
+ vmlal.s16 q5, d28, d7[1] ;sumtemp for i=1
+ vmlal.s16 q6, d28, d7[2] ;sumtemp for i=2
+ vmlal.s16 q7, d28, d7[3] ;sumtemp for i=3
+
+ vrshr.s32 q0, q4, #16
+ vrshr.s32 q1, q5, #16
+ vrshr.s32 q2, q6, #16
+ vrshr.s32 q3, q7, #16
+
+ vmovn.i32 d0, q0
+ vmovn.i32 d1, q1
+ vmovn.i32 d2, q2
+ vmovn.i32 d3, q3
+
+ vst1.16 {q0, q1}, [r1]
+
+ bx lr
+
+ ENDP
+
+; r0 short *input
+; r1 short *output
+; r2 int pitch
+|vp8_short_fdct8x4_neon| PROC
+    ; Store the link register and the input pointer before calling the
+    ; first 4x4 fdct. The output pointer and pitch registers do not need
+    ; saving because they are not modified by the 4x4 fdct.
+ stmdb sp!, {r0, lr}
+
+ bl vp8_short_fdct4x4_neon
+
+ ldmia sp!, {r0, lr}
+
+ ; Move to the next block of data.
+ add r0, r0, #8
+ add r1, r1, #32
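+
+    ; (i.e. the 8x4 fdct is just two 4x4 fdcts side by side: the second one
+    ; starts 4 shorts (8 bytes) further right in the input and fills the next
+    ; 16 shorts (32 bytes) of the output.)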
+
+ ; Second time through do not store off the
+    ; link register, just return from the 4x4 fdct
+ b vp8_short_fdct4x4_neon
+
+ ; Should never get to this.
+ bx lr
+
+ ENDP
+
+;-----------------
+ AREA dct4x4_dat, DATA, READONLY
+;Data section for the dct matrix. Each DCD below reserves one word.
+;The label dct_matrix can be used to access the data.
+;Data addresses: dct_matrix, dct_matrix+4, dct_matrix+8 ...
+_dct_matrix_
+ DCD dct_matrix
+dct_matrix
+; DCW 23170, 30274, 23170, 12540
+; DCW 23170, 12540, -23170,-30274
+; DCW 23170, -12540, -23170, 30274
+; DCW 23170, -30274, 23170,-12540
+; 23170 = 0x5a82
+; -23170 = 0xa57e
+; 30274 = 0x7642
+; -30274 = 0x89be
+; 12540 = 0x30fc
+; -12540 = 0xcf04
+ DCD 0x76425a82, 0x30fc5a82
+ DCD 0x30fc5a82, 0x89bea57e
+ DCD 0xcf045a82, 0x7642a57e
+ DCD 0x89be5a82, 0xcf045a82
+
+ END
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
new file mode 100644
index 000000000..8781ca0cc
--- /dev/null
+++ b/vp8/encoder/arm/neon/subtract_neon.asm
@@ -0,0 +1,171 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_subtract_b_neon_func|
+ EXPORT |vp8_subtract_mby_neon|
+ EXPORT |vp8_subtract_mbuv_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch);
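+;Each diff value is src[i] - pred[i]; vsubl.u8 widens the operands to 16 bits
+;so negative differences are preserved.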
+|vp8_subtract_b_neon_func| PROC
+ ldr r12, [sp] ;load pitch
+
+ vld1.8 {d0}, [r1], r3 ;load src
+ vld1.8 {d1}, [r2], r12 ;load pred
+ vld1.8 {d2}, [r1], r3
+ vld1.8 {d3}, [r2], r12
+ vld1.8 {d4}, [r1], r3
+ vld1.8 {d5}, [r2], r12
+ vld1.8 {d6}, [r1], r3
+ vld1.8 {d7}, [r2], r12
+
+ vsubl.u8 q10, d0, d1
+ vsubl.u8 q11, d2, d3
+ vsubl.u8 q12, d4, d5
+ vsubl.u8 q13, d6, d7
+
+ mov r12, r12, lsl #1
+
+ vst1.16 {d20}, [r0], r12 ;store diff
+ vst1.16 {d22}, [r0], r12
+ vst1.16 {d24}, [r0], r12
+ vst1.16 {d26}, [r0], r12
+
+ bx lr
+ ENDP
+
+;==========================================
+;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
+|vp8_subtract_mby_neon| PROC
+ mov r12, #4
+
+subtract_mby_loop
+ vld1.8 {q0}, [r1], r3 ;load src
+ vld1.8 {q1}, [r2]! ;load pred
+ vld1.8 {q2}, [r1], r3
+ vld1.8 {q3}, [r2]!
+ vld1.8 {q4}, [r1], r3
+ vld1.8 {q5}, [r2]!
+ vld1.8 {q6}, [r1], r3
+ vld1.8 {q7}, [r2]!
+
+ vsubl.u8 q8, d0, d2
+ vsubl.u8 q9, d1, d3
+ vsubl.u8 q10, d4, d6
+ vsubl.u8 q11, d5, d7
+ vsubl.u8 q12, d8, d10
+ vsubl.u8 q13, d9, d11
+ vsubl.u8 q14, d12, d14
+ vsubl.u8 q15, d13, d15
+
+ vst1.16 {q8}, [r0]! ;store diff
+ vst1.16 {q9}, [r0]!
+ vst1.16 {q10}, [r0]!
+ vst1.16 {q11}, [r0]!
+ vst1.16 {q12}, [r0]!
+ vst1.16 {q13}, [r0]!
+ vst1.16 {q14}, [r0]!
+ vst1.16 {q15}, [r0]!
+
+ subs r12, r12, #1
+ bne subtract_mby_loop
+
+ bx lr
+ ENDP
+
+;=================================
+;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+|vp8_subtract_mbuv_neon| PROC
+ ldr r12, [sp]
+
+;u
+ add r0, r0, #512 ; short *udiff = diff + 256;
+ add r3, r3, #256 ; unsigned char *upred = pred + 256;
+
+ vld1.8 {d0}, [r1], r12 ;load src
+ vld1.8 {d1}, [r3]! ;load pred
+ vld1.8 {d2}, [r1], r12
+ vld1.8 {d3}, [r3]!
+ vld1.8 {d4}, [r1], r12
+ vld1.8 {d5}, [r3]!
+ vld1.8 {d6}, [r1], r12
+ vld1.8 {d7}, [r3]!
+ vld1.8 {d8}, [r1], r12
+ vld1.8 {d9}, [r3]!
+ vld1.8 {d10}, [r1], r12
+ vld1.8 {d11}, [r3]!
+ vld1.8 {d12}, [r1], r12
+ vld1.8 {d13}, [r3]!
+ vld1.8 {d14}, [r1], r12
+ vld1.8 {d15}, [r3]!
+
+ vsubl.u8 q8, d0, d1
+ vsubl.u8 q9, d2, d3
+ vsubl.u8 q10, d4, d5
+ vsubl.u8 q11, d6, d7
+ vsubl.u8 q12, d8, d9
+ vsubl.u8 q13, d10, d11
+ vsubl.u8 q14, d12, d13
+ vsubl.u8 q15, d14, d15
+
+ vst1.16 {q8}, [r0]! ;store diff
+ vst1.16 {q9}, [r0]!
+ vst1.16 {q10}, [r0]!
+ vst1.16 {q11}, [r0]!
+ vst1.16 {q12}, [r0]!
+ vst1.16 {q13}, [r0]!
+ vst1.16 {q14}, [r0]!
+ vst1.16 {q15}, [r0]!
+
+;v
+ vld1.8 {d0}, [r2], r12 ;load src
+ vld1.8 {d1}, [r3]! ;load pred
+ vld1.8 {d2}, [r2], r12
+ vld1.8 {d3}, [r3]!
+ vld1.8 {d4}, [r2], r12
+ vld1.8 {d5}, [r3]!
+ vld1.8 {d6}, [r2], r12
+ vld1.8 {d7}, [r3]!
+ vld1.8 {d8}, [r2], r12
+ vld1.8 {d9}, [r3]!
+ vld1.8 {d10}, [r2], r12
+ vld1.8 {d11}, [r3]!
+ vld1.8 {d12}, [r2], r12
+ vld1.8 {d13}, [r3]!
+ vld1.8 {d14}, [r2], r12
+ vld1.8 {d15}, [r3]!
+
+ vsubl.u8 q8, d0, d1
+ vsubl.u8 q9, d2, d3
+ vsubl.u8 q10, d4, d5
+ vsubl.u8 q11, d6, d7
+ vsubl.u8 q12, d8, d9
+ vsubl.u8 q13, d10, d11
+ vsubl.u8 q14, d12, d13
+ vsubl.u8 q15, d14, d15
+
+ vst1.16 {q8}, [r0]! ;store diff
+ vst1.16 {q9}, [r0]!
+ vst1.16 {q10}, [r0]!
+ vst1.16 {q11}, [r0]!
+ vst1.16 {q12}, [r0]!
+ vst1.16 {q13}, [r0]!
+ vst1.16 {q14}, [r0]!
+ vst1.16 {q15}, [r0]!
+
+ bx lr
+ ENDP
+
+ END
diff --git a/vp8/encoder/arm/neon/variance_neon.asm b/vp8/encoder/arm/neon/variance_neon.asm
new file mode 100644
index 000000000..64b83ca43
--- /dev/null
+++ b/vp8/encoder/arm/neon/variance_neon.asm
@@ -0,0 +1,275 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance16x16_neon|
+ EXPORT |vp8_variance16x8_neon|
+ EXPORT |vp8_variance8x16_neon|
+ EXPORT |vp8_variance8x8_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+variance16x16_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
+ ;the results into the elements of the destination vector. The explanation
+    ;in the ARM guide is wrong.
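+    ;(e.g. vpadal.s16 q8, q11 performs q8.s32[n] += q11.s16[2n] + q11.s16[2n+1]
+    ; for n = 0..3.)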
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance16x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ ;vmov.32 r0, d0[0] ;this instruction costs a lot
+ ;vmov.32 r1, d1[0]
+ ;mul r0, r0, r0
+ ;str r1, [r12]
+ ;sub r0, r1, r0, asr #8
+
+    ;sum is in [-255x256, 255x256] and sum*sum is a 32-bit quantity. The right
+    ;shift must sign-extend (vshr.s), so we have to use s32 to get this right.
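+    ;(i.e. variance = sse - ((sum * sum) >> 8) here, since a 16x16 block has
+    ; 256 pixels; the 16x8/8x16 versions below shift by 7 and the 8x8 by 6.)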
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;================================
+;unsigned int vp8_variance16x8_c(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *sse)
+|vp8_variance16x8_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #4
+
+variance16x8_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance16x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.s32 d10, d10, #7
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;=================================
+;unsigned int vp8_variance8x16_c(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *sse)
+
+|vp8_variance8x16_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+variance8x16_neon_loop
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d2, d6
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+
+ bne variance8x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.s32 d10, d10, #7
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;==================================
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #2
+
+variance8x8_neon_loop
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d7}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance8x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.s32 d10, d10, #6
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+ END
diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
new file mode 100644
index 000000000..f26b4d7ae
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -0,0 +1,67 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_memcpy_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
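+;Copies sz bytes, 256 at a time; the remaining (sz & 0xff) bytes are copied
+;16 per iteration, so the remainder is assumed to be a multiple of 16.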
+|vp8_memcpy_neon| PROC
+ ;pld [r1] ;preload pred data
+ ;pld [r1, #128]
+ ;pld [r1, #256]
+ ;pld [r1, #384]
+
+    mov     r12, r2, lsr #8             ;copy 256 bytes of data at a time
+
+memcpy_neon_loop
+ vld1.8 {q0, q1}, [r1]! ;load src data
+ subs r12, r12, #1
+ vld1.8 {q2, q3}, [r1]!
+ vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
+ vld1.8 {q4, q5}, [r1]!
+ vst1.8 {q2, q3}, [r0]!
+ vld1.8 {q6, q7}, [r1]!
+ vst1.8 {q4, q5}, [r0]!
+ vld1.8 {q8, q9}, [r1]!
+ vst1.8 {q6, q7}, [r0]!
+ vld1.8 {q10, q11}, [r1]!
+ vst1.8 {q8, q9}, [r0]!
+ vld1.8 {q12, q13}, [r1]!
+ vst1.8 {q10, q11}, [r0]!
+ vld1.8 {q14, q15}, [r1]!
+ vst1.8 {q12, q13}, [r0]!
+ vst1.8 {q14, q15}, [r0]!
+
+ ;pld [r1] ;preload pred data -- need to adjust for real device
+ ;pld [r1, #128]
+ ;pld [r1, #256]
+ ;pld [r1, #384]
+
+ bne memcpy_neon_loop
+
+ ands r3, r2, #0xff ;extra copy
+ beq done_copy_neon_loop
+
+extra_copy_neon_loop
+ vld1.8 {q0}, [r1]! ;load src data
+ subs r3, r3, #16
+ vst1.8 {q0}, [r0]!
+ bne extra_copy_neon_loop
+
+done_copy_neon_loop
+ bx lr
+ ENDP
+
+ END
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
new file mode 100644
index 000000000..f53596727
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -0,0 +1,172 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mse16x16_neon|
+ EXPORT |vp8_get16x16pred_error_neon|
+ EXPORT |vp8_get4x4sse_cs_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;============================
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;note: sum is never used in this function, so that part of the vp8_variance()
+;calculation is omitted and only the sse is computed.
+
+|vp8_mse16x16_neon| PROC
+ vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
+ vmov.i8 q8, #0
+ vmov.i8 q9, #0
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+mse16x16_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vmlal.s16 q7, d22, d22
+ vmlal.s16 q8, d23, d23
+
+ subs r12, r12, #1
+
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vmlal.s16 q7, d26, d26
+ vmlal.s16 q8, d27, d27
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne mse16x16_neon_loop
+
+ vadd.u32 q7, q7, q8
+ vadd.u32 q9, q9, q10
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vadd.u32 q10, q7, q9
+ vpaddl.u32 q1, q10
+ vadd.u64 d0, d2, d3
+
+ vst1.32 {d0[0]}, [r12]
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;============================
+; r0 unsigned char *src_ptr
+; r1 int src_stride
+; r2 unsigned char *ref_ptr
+; r3 int ref_stride
+|vp8_get16x16pred_error_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - pred_error
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+get16x16pred_error_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11
+ vmlal.s16 q9, d22, d22
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne get16x16pred_error_neon_loop
+
+ vadd.u32 q10, q9, q10
+ vpaddl.s32 q0, q8
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0]
+ bx lr
+
+ ENDP
+
+;=============================
+; r0 unsigned char *src_ptr,
+; r1 int source_stride,
+; r2 unsigned char *ref_ptr,
+; r3 int recon_stride
+|vp8_get4x4sse_cs_neon| PROC
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d7}, [r2], r3
+
+ vsubl.u8 q11, d0, d4
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vmull.s16 q7, d22, d22
+ vmull.s16 q8, d24, d24
+ vmull.s16 q9, d26, d26
+ vmull.s16 q10, d28, d28
+
+ vadd.u32 q7, q7, q8
+ vadd.u32 q9, q9, q10
+ vadd.u32 q9, q7, q9
+
+ vpaddl.u32 q1, q9
+ vadd.u64 d0, d2, d3
+
+ vmov.32 r0, d0[0]
+ bx lr
+
+ ENDP
+
+ END
diff --git a/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm
new file mode 100644
index 000000000..9c52c52f6
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm
@@ -0,0 +1,300 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_tokens_armv7|
+
+ INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 vp8_writer *w
+; r1 const TOKENEXTRA *p
+; r2 int xcount
+; r3 vp8_coef_encodings
+; s0 vp8_extra_bits
+; s1 vp8_coef_tree
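+;
+; For orientation, a rough C sketch of the loop below (field names are
+; approximate; this is an illustration, not the exact bitstream.c source):
+;
+;   while (p < stop) {
+;       const vp8_token *a = vp8_coef_encodings + p->Token;
+;       const unsigned char *pp = p->context_tree;
+;       int v = a->value, n = a->Len, i = 0;
+;       if (p->skip_eob_node) { n--; i = 2; }
+;       do {
+;           int bb = (v >> --n) & 1;
+;           /* boolean-encode bb with probability pp[i >> 1] */
+;           i = vp8_coef_tree[i + bb];
+;       } while (n);
+;       /* then, if vp8_extra_bits[p->Token].base_val, encode the extra bits
+;          and the sign taken from p->Extra */
+;       ++p;
+;   }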
+|vp8cx_pack_tokens_armv7| PROC
+ push {r4-r11, lr}
+
+    ; Add xcount * sizeof (TOKENEXTRA) to p to get stop
+ ; sizeof (TOKENEXTRA) is 20
+ add r2, r2, r2, lsl #2 ; xcount
+ sub sp, sp, #12
+ add r2, r1, r2, lsl #2 ; stop = p + xcount
+ str r2, [sp, #0]
+ str r3, [sp, #8] ; save vp8_coef_encodings
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+ b check_p_lt_stop
+
+while_p_lt_stop
+ ldr r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #8] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldr r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp8_token_value] ; v
+ ldr r8, [r4, #vp8_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ ; reverse the stream of bits to be packed. Normally
+ ; the most significant bit is peeled off and compared
+ ; in the form of (v >> --n) & 1. ARM architecture has
+ ; the ability to set a flag based on the value of the
+ ; bit shifted off the bottom of the register. To make
+ ; that happen the bitstream is reversed.
+ rbit r12, r6
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #52] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsr r12, r12, r4 ; v >>= 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsrs r12, r12, #1 ; bb = v >> n
+ mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but it is
+    ; used as a temp variable here, so reload vp8_coef_tree
+    ; into r10 afterwards
+ ldr r10, [sp, #52] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldr r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #48] ; vp8_extra_bits
+ ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ; element.  sizeof (vp8_extra_bit_struct) is 20
+ add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t
+ add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp8_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
+ ldr lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp8_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp8_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rbit r12, r7 ; reverse v
+ rsb r4, r8, #32
+ lsr r12, r12, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsrs r12, r12, #1 ; v >> n
+ mul r4, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp8_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp8_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mvn r3, #7
+ ldr r7, [r0, #vp8_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #0x10]
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ add sp, sp, #12
+ pop {r4-r11, pc}
+ ENDP
+
+ END
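The bit-reversal trick described in the comments above can be sketched in C as follows (illustrative only; bit_reverse32 stands in for the ARM rbit instruction and the elided encode_bool call for the boolean coder that the rest of the routine implements). Rather than peeling bits off the top with (v >> --n) & 1, the value is reversed once so each bit can be shifted off the bottom, which on ARM sets the carry flag for free.

    #include <stdint.h>

    /* Stand-in for the ARM rbit instruction. */
    static uint32_t bit_reverse32(uint32_t v)
    {
        uint32_t r = 0;
        int i;
        for (i = 0; i < 32; i++)
            r |= ((v >> i) & 1u) << (31 - i);
        return r;
    }

    /* Peel the n low bits of v, most significant first (assumes 1 <= n <= 32). */
    void emit_bits_sketch(uint32_t v, int n)
    {
        uint32_t rv = bit_reverse32(v) >> (32 - n);  /* rbit + lsr, as in the assembly */
        while (n--) {
            int bb = rv & 1;   /* the assembly reads this from the carry flag after lsrs */
            rv >>= 1;
            /* encode_bool(bb, pp[i >> 1]); -- arithmetic-coder step elided */
            (void)bb;
        }
    }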
diff --git a/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm
new file mode 100644
index 000000000..92b098909
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm
@@ -0,0 +1,335 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_mb_row_tokens_armv7|
+
+ INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 vp8_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
+|vp8cx_pack_mb_row_tokens_armv7| PROC
+ push {r4-r11, lr}
+ sub sp, sp, #24
+
+ ; Compute address of cpi->common.mb_rows
+ ldr r4, _VP8_COMP_common_
+ ldr r6, _VP8_COMMON_MBrows_
+ add r4, r0, r4
+
+ ldr r5, [r4, r6] ; load up mb_rows
+
+ str r2, [sp, #20] ; save vp8_coef_encodings
+ str r5, [sp, #12] ; save mb_rows
+ str r3, [sp, #8] ; save vp8_extra_bits
+
+ ldr r4, _VP8_COMP_tplist_
+ add r4, r0, r4
+ ldr r7, [r4, #0] ; dereference cpi->tp_list
+
+ mov r0, r1 ; keep same as other loops
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+mb_row_loop
+
+ ldr r1, [r7, #tokenlist_start]
+ ldr r9, [r7, #tokenlist_stop]
+ str r9, [sp, #0] ; save stop for later comparison
+ str r7, [sp, #16] ; tokenlist address for next time
+
+ b check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+ ldr r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #20] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldr r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp8_token_value] ; v
+ ldr r8, [r4, #vp8_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ ; reverse the stream of bits to be packed. Normally
+ ; the most significant bit is peeled off and compared
+ ; in the form of (v >> --n) & 1. ARM architecture has
+ ; the ability to set a flag based on the value of the
+ ; bit shifted off the bottom of the register. To make
+ ; that happen the bitstream is reversed.
+ rbit r12, r6
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #60] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsr r12, r12, r4 ; v >>= 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsrs r12, r12, #1 ; bb = v >> n
+ mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but it is
+    ; used as a temp variable here, so reload vp8_coef_tree
+    ; into r10 afterwards
+ ldr r10, [sp, #60] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldr r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #8] ; vp8_extra_bits
+ ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ; element.  sizeof (vp8_extra_bit_struct) is 20
+ add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t
+ add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp8_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
+ ldr lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp8_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp8_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rbit r12, r7 ; reverse v
+ rsb r4, r8, #32
+ lsr r12, r12, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsrs r12, r12, #1 ; v >> n
+ mul r4, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp8_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp8_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mvn r3, #7
+ ldr r7, [r0, #vp8_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #0x10]
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ ldr r6, [sp, #12] ; mb_rows
+ ldr r7, [sp, #16] ; tokenlist address
+ subs r6, r6, #1
+ add r7, r7, #TOKENLIST_SZ ; next element in the array
+ str r6, [sp, #12]
+ bne mb_row_loop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ add sp, sp, #24
+ pop {r4-r11, pc}
+ ENDP
+
+_VP8_COMP_common_
+ DCD vp8_comp_common
+_VP8_COMMON_MBrows_
+ DCD vp8_common_mb_rows
+_VP8_COMP_tplist_
+ DCD vp8_comp_tplist
+
+ END
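The routine above only changes the outer loop of vp8cx_pack_tokens_armv7: it walks one token list per macroblock row out of cpi->tplist and runs the same packing loop on each. A rough sketch, with stand-in types rather than the real encoder-header definitions:

    /* Stand-in types; the real TOKENEXTRA/TOKENLIST live in the encoder headers. */
    typedef struct { int token, extra; } tok_sketch;
    typedef struct { tok_sketch *start, *stop; } toklist_sketch;

    static void pack_one_token(const tok_sketch *t) { (void)t; /* inner loop from above */ }

    void pack_mb_row_tokens_sketch(const toklist_sketch *tplist, int mb_rows)
    {
        int row;
        for (row = 0; row < mb_rows; row++) {
            const tok_sketch *p = tplist[row].start;
            while (p < tplist[row].stop)      /* check_p_lt_stop */
                pack_one_token(p++);
        }
    }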
diff --git a/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm
new file mode 100644
index 000000000..6d5f882ed
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm
@@ -0,0 +1,471 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_tokens_into_partitions_armv7|
+
+ INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 unsigned char *cx_data
+; r2 int num_part
+; r3 *size
+; s0 vp8_coef_encodings
+; s1 vp8_extra_bits,
+; s2 const vp8_tree_index *,
+
+|vp8cx_pack_tokens_into_partitions_armv7| PROC
+ push {r4-r11, lr}
+ sub sp, sp, #44
+
+ ; Compute address of cpi->common.mb_rows
+ ldr r4, _VP8_COMP_common_
+ ldr r6, _VP8_COMMON_MBrows_
+ add r4, r0, r4
+
+ ldr r5, [r4, r6] ; load up mb_rows
+
+ str r5, [sp, #36] ; save mb_rows
+ str r1, [sp, #24] ; save cx_data
+ str r2, [sp, #20] ; save num_part
+ str r3, [sp, #8] ; save *size
+
+ ; *size = 3*(num_part -1 );
+ sub r2, r2, #1 ; num_part - 1
+ add r2, r2, r2, lsl #1 ; 3*(num_part - 1)
+ str r2, [r3]
+
+ add r2, r2, r1 ; cx_data + *size
+ str r2, [sp, #40] ; ptr
+
+ ldr r4, _VP8_COMP_tplist_
+ add r4, r0, r4
+ ldr r7, [r4, #0] ; dereference cpi->tp_list
+ str r7, [sp, #32] ; store start of cpi->tp_list
+
+ ldr r11, _VP8_COMP_bc2_ ; load up vp8_writer out of cpi
+ add r0, r0, r11
+
+ mov r11, #0
+ str r11, [sp, #28] ; i
+
+numparts_loop
+ ldr r10, [sp, #40] ; ptr
+ ldr r5, [sp, #36] ; move mb_rows to the counting section
+ str r5, [sp, #12]
+
+ ; Reset all of the VP8 Writer data for each partition that
+ ; is processed.
+ ; start_encode
+ mov r2, #0 ; vp8_writer_lowvalue
+ mov r5, #255 ; vp8_writer_range
+ mvn r3, #23 ; vp8_writer_count
+
+ str r2, [r0, #vp8_writer_value]
+ str r2, [r0, #vp8_writer_pos]
+ str r10, [r0, #vp8_writer_buffer]
+
+mb_row_loop
+
+ ldr r1, [r7, #tokenlist_start]
+ ldr r9, [r7, #tokenlist_stop]
+ str r9, [sp, #0] ; save stop for later comparison
+ str r7, [sp, #16] ; tokenlist address for next time
+
+ b check_p_lt_stop
+
+ ; actual work gets done here!
+
+while_p_lt_stop
+ ldr r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #80] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldr r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp8_token_value] ; v
+ ldr r8, [r4, #vp8_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ ; reverse the stream of bits to be packed. Normally
+ ; the most significant bit is peeled off and compared
+ ; in the form of (v >> --n) & 1. ARM architecture has
+ ; the ability to set a flag based on the value of the
+ ; bit shifted off the bottom of the register. To make
+ ; that happen the bitstream is reversed.
+ rbit r12, r6
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #88] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsr r12, r12, r4 ; v >>= 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsrs r12, r12, #1 ; bb = v >> n
+ mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but it is
+    ; used as a temp variable here, so reload vp8_coef_tree
+    ; into r10 afterwards
+ ldr r10, [sp, #88] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldr r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #84] ; vp8_extra_bits
+ ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ; element.  sizeof (vp8_extra_bit_struct) is 20
+ add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t
+ add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp8_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
+ ldr lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp8_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp8_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rbit r12, r7 ; reverse v
+ rsb r4, r8, #32
+ lsr r12, r12, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsrs r12, r12, #1 ; v >> n
+ mul r4, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r4, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp8_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp8_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mvn r3, #7
+ ldr r7, [r0, #vp8_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #0x10]
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ ldr r10, [sp, #20] ; num_parts
+ mov r1, #TOKENLIST_SZ
+ mul r1, r10, r1
+
+ ldr r6, [sp, #12] ; mb_rows
+ ldr r7, [sp, #16] ; tokenlist address
+ subs r6, r6, r10
+ add r7, r7, r1 ; next element in the array
+ str r6, [sp, #12]
+ bgt mb_row_loop
+
+ mov r12, #32
+
+stop_encode_loop
+ sub r7, r5, #1 ; range-1
+
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_se ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_se
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_se
+token_zero_while_loop_se
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_se
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop_se
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_se
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r12, r12, #1
+ bne stop_encode_loop
+
+ ldr r10, [sp, #8] ; *size
+ ldr r11, [r10]
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ add r11, r11, r4 ; *size += w->pos
+ str r11, [r10]
+
+ ldr r9, [sp, #20] ; num_parts
+ sub r9, r9, #1
+ ldr r10, [sp, #28] ; i
+ cmp r10, r9 ; if(i<(num_part - 1))
+ bge skip_write_partition
+
+ ldr r12, [sp, #40] ; ptr
+ add r12, r12, r4 ; ptr += w->pos
+ str r12, [sp, #40]
+
+ ldr r9, [sp, #24] ; cx_data
+ mov r8, r4, asr #8
+ strb r4, [r9, #0]
+ strb r8, [r9, #1]
+ mov r4, r4, asr #16
+ strb r4, [r9, #2]
+
+ add r9, r9, #3 ; cx_data += 3
+ str r9, [sp, #24]
+
+skip_write_partition
+
+ ldr r11, [sp, #28] ; i
+ ldr r10, [sp, #20] ; num_parts
+
+ add r11, r11, #1 ; i++
+ str r11, [sp, #28]
+
+ ldr r7, [sp, #32] ; cpi->tp_list[i]
+ mov r1, #TOKENLIST_SZ
+ add r7, r7, r1 ; next element in cpi->tp_list
+ str r7, [sp, #32] ; cpi->tp_list[i+1]
+
+ cmp r10, r11
+ bgt numparts_loop
+
+
+ add sp, sp, #44
+ pop {r4-r11, pc}
+ ENDP
+
+_VP8_COMP_common_
+ DCD vp8_comp_common
+_VP8_COMMON_MBrows_
+ DCD vp8_common_mb_rows
+_VP8_COMP_tplist_
+ DCD vp8_comp_tplist
+_VP8_COMP_bc2_
+ DCD vp8_comp_bc2
+
+ END
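Beyond the shared packing loop, the partition variant above resets the vp8_writer for each partition, runs a 32-bit stop_encode flush, and records each partition's length (except the last) as a 3-byte little-endian value at the front of cx_data before advancing the data pointer by w->pos. A hedged sketch of the size write performed by the three strb instructions (the function name is illustrative):

    /* Hypothetical sketch of the 3-byte partition-size header written above. */
    void write_partition_size_sketch(unsigned char *cx_data, unsigned int pos)
    {
        cx_data[0] = (unsigned char)(pos & 0xff);          /* strb r4, [r9, #0] */
        cx_data[1] = (unsigned char)((pos >> 8) & 0xff);   /* strb r8, [r9, #1] */
        cx_data[2] = (unsigned char)((pos >> 16) & 0xff);  /* strb r4, [r9, #2] */
    }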
diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
new file mode 100644
index 000000000..5269c0af8
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
@@ -0,0 +1,75 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_walsh4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
+
+|vp8_short_walsh4x4_neon| PROC
+ vld1.16 {d2}, [r0], r2 ;load input
+ vld1.16 {d3}, [r0], r2
+ vld1.16 {d4}, [r0], r2
+ vld1.16 {d5}, [r0], r2
+
+ ;First for-loop
+ ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[3]
+ vadd.s16 d7, d3, d4 ;b1 = ip[1]+ip[2]
+ vsub.s16 d8, d3, d4 ;c1 = ip[1]-ip[2]
+ vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[3]
+
+ vadd.s16 d2, d6, d7 ;op[0] = a1 + b1
+ vsub.s16 d4, d6, d7 ;op[2] = a1 - b1
+ vadd.s16 d3, d8, d9 ;op[1] = c1 + d1
+ vsub.s16 d5, d9, d8 ;op[3] = d1 - c1
+
+ ;Second for-loop
+ ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[12]
+ vadd.s16 d7, d3, d4 ;b1 = ip[4]+ip[8]
+ vsub.s16 d8, d3, d4 ;c1 = ip[4]-ip[8]
+ vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[12]
+
+ vadd.s16 d2, d6, d7 ;a2 = a1 + b1;
+ vsub.s16 d4, d6, d7 ;c2 = a1 - b1;
+ vadd.s16 d3, d8, d9 ;b2 = c1 + d1;
+ vsub.s16 d5, d9, d8 ;d2 = d1 - c1;
+
+ vcgt.s16 q3, q1, #0
+ vcgt.s16 q4, q2, #0
+
+ vsub.s16 q1, q1, q3
+ vsub.s16 q2, q2, q4
+
+ vshr.s16 q1, q1, #1
+ vshr.s16 q2, q2, #1
+
+ vst1.16 {q1, q2}, [r1]
+
+ bx lr
+
+ ENDP
+
+ END
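A scalar sketch of the transform the NEON routine above computes, reconstructed from its inline comments (the a1/b1/c1/d1 and a2/b2/c2/d2 butterflies plus the (x + (x > 0)) >> 1 rounding that the vcgt/vsub/vshr sequence implements); the C reference in vp8/encoder/dct.c remains authoritative, and the function name here is illustrative.

    /* Scalar sketch of the 4x4 Walsh-Hadamard forward transform above.
     * pitch is in bytes, as in the C prototype quoted in the file header. */
    void short_walsh4x4_sketch(const short *input, short *output, int pitch)
    {
        int i;
        short tmp[16];

        /* first pass: rows */
        for (i = 0; i < 4; i++) {
            const short *ip = input + i * (pitch / 2);
            int a1 = ip[0] + ip[3];
            int b1 = ip[1] + ip[2];
            int c1 = ip[1] - ip[2];
            int d1 = ip[0] - ip[3];

            tmp[i * 4 + 0] = (short)(a1 + b1);
            tmp[i * 4 + 1] = (short)(c1 + d1);
            tmp[i * 4 + 2] = (short)(a1 - b1);
            tmp[i * 4 + 3] = (short)(d1 - c1);
        }

        /* second pass: columns, with (x + (x > 0)) >> 1 rounding */
        for (i = 0; i < 4; i++) {
            int a1 = tmp[i] + tmp[12 + i];
            int b1 = tmp[4 + i] + tmp[8 + i];
            int c1 = tmp[4 + i] - tmp[8 + i];
            int d1 = tmp[i] - tmp[12 + i];

            int a2 = a1 + b1;
            int b2 = c1 + d1;
            int c2 = a1 - b1;
            int d2 = d1 - c1;

            output[i]      = (short)((a2 + (a2 > 0)) >> 1);
            output[4 + i]  = (short)((b2 + (b2 > 0)) >> 1);
            output[8 + i]  = (short)((c2 + (c2 > 0)) >> 1);
            output[12 + i] = (short)((d2 + (d2 > 0)) >> 1);
        }
    }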
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
new file mode 100644
index 000000000..aec716e3b
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -0,0 +1,427 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sub_pixel_variance16x16_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
+
+|vp8_sub_pixel_variance16x16_neon| PROC
+ push {r4-r6, lr}
+
+ ldr r12, _BilinearTaps_coeff_
+ ldr r4, [sp, #16] ;load *dst_ptr from stack
+ ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
+ ldr r6, [sp, #24] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16_only
+
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {d31}, [r2] ;load first_pass filter
+
+ beq firstpass_bfilter16x16_only
+
+ sub sp, sp, #272 ;reserve space on stack for temporary storage
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ mov lr, sp
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ mov r2, #3 ;loop counter
+ vld1.u8 {d8, d9, d10}, [r0], r1
+
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vqrshrn.u16 d21, q14, #7
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vst1.u8 {d18, d19, d20, d21}, [lr]!
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ bne vp8e_filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+ vld1.u8 {d14, d15, d16}, [r0], r1
+
+ vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q10, d3, d0
+ vmull.u8 q11, d5, d0
+ vmull.u8 q12, d6, d0
+ vmull.u8 q13, d8, d0
+ vmull.u8 q14, d9, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+
+ vmlal.u8 q9, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q11, d5, d1
+ vmlal.u8 q13, d8, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+
+ vmlal.u8 q10, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q12, d6, d1
+ vmlal.u8 q14, d9, d1
+
+ vmull.u8 q1, d11, d0
+ vmull.u8 q2, d12, d0
+ vmull.u8 q3, d14, d0
+ vmull.u8 q4, d15, d0
+
+ vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
+ vext.8 d14, d14, d15, #1
+
+ vmlal.u8 q1, d11, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q3, d14, d1
+
+ vext.8 d12, d12, d13, #1
+ vext.8 d15, d15, d16, #1
+
+ vmlal.u8 q2, d12, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q4, d15, d1
+
+ vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d11, q10, #7
+ vqrshrn.u16 d12, q11, #7
+ vqrshrn.u16 d13, q12, #7
+ vqrshrn.u16 d14, q13, #7
+ vqrshrn.u16 d15, q14, #7
+ vqrshrn.u16 d16, q1, #7
+ vqrshrn.u16 d17, q2, #7
+ vqrshrn.u16 d18, q3, #7
+ vqrshrn.u16 d19, q4, #7
+
+ vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
+ vst1.u8 {d14, d15, d16, d17}, [lr]!
+ vst1.u8 {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ sub lr, lr, #272
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ sub sp, sp, #256
+ mov r3, sp
+
+ vld1.u8 {d22, d23}, [lr]! ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r12, #4 ;loop counter
+
+vp8e_filt_blk2d_sp16x16_loop_neon
+ vld1.u8 {d24, d25}, [lr]!
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vld1.u8 {d26, d27}, [lr]!
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [lr]!
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [lr]!
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ subs r12, r12, #1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r3]! ;store result
+ vst1.u8 {d4, d5}, [r3]!
+ vst1.u8 {d6, d7}, [r3]!
+ vmov q11, q15
+ vst1.u8 {d8, d9}, [r3]!
+
+ bne vp8e_filt_blk2d_sp16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;--------------------
+firstpass_bfilter16x16_only
+ mov r2, #4 ;loop counter
+ sub sp, sp, #528 ;reserve space on stack for temporary storage
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vld1.u8 {d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+ vst1.u8 {d14, d15}, [r3]! ;store result
+ vqrshrn.u16 d21, q14, #7
+
+ vst1.u8 {d16, d17}, [r3]!
+ vst1.u8 {d18, d19}, [r3]!
+ vst1.u8 {d20, d21}, [r3]!
+
+ bne vp8e_filt_blk2d_fpo16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+ sub sp, sp, #528 ;reserve space on stack for temporary storage
+ add r3, r12, r3, lsl #3
+ mov r12, #4 ;loop counter
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ vld1.u8 {d22, d23}, [r0], r1 ;load src data
+ mov r3, sp
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+vp8e_filt_blk2d_spo16x16_loop_neon
+ vld1.u8 {d24, d25}, [r0], r1
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vld1.u8 {d26, d27}, [r0], r1
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [r0], r1
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [r0], r1
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r3]! ;store result
+ subs r12, r12, #1
+ vst1.u8 {d4, d5}, [r3]!
+ vmov q11, q15
+ vst1.u8 {d6, d7}, [r3]!
+ vst1.u8 {d8, d9}, [r3]!
+
+ bne vp8e_filt_blk2d_spo16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ sub r3, r3, #256
+ mov r12, #8
+
+sub_pixel_variance16x16_neon_loop
+ vld1.8 {q0}, [r3]! ;Load up source and reference
+ vld1.8 {q2}, [r4], r5
+ vld1.8 {q1}, [r3]!
+ vld1.8 {q3}, [r4], r5
+
+ vsubl.u8 q11, d0, d4 ;diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne sub_pixel_variance16x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r6] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ add sp, sp, #528
+ vmov.32 r0, d0[0] ;return
+
+ pop {r4-r6,pc}
+
+ ENDP
+
+;-----------------
+ AREA vp8e_bilinear_taps_dat, DATA, READWRITE ;read/write by default
+;Data section vp8e_bilinear_taps_dat holds the bilinear filter taps. DCD reserves one
+;word per value. The label bilinear_taps_coeff is used to access the data.
+;Data address: bilinear_taps_coeff, bilinear_taps_coeff+4, bilinear_taps_coeff+8 ...
+_BilinearTaps_coeff_
+ DCD bilinear_taps_coeff
+bilinear_taps_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
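The first and second passes above are bilinear filters driven by the tap table at the end of the file, and every variance routine in these files ends with the same reduction: store sse and return sse - (sum*sum) >> 8. A hedged scalar sketch of the two building blocks (the names below are illustrative, not the library's):

    /* Tap pairs indexed by xoffset/yoffset (0..7), mirroring bilinear_taps_coeff. */
    static const int bilinear_taps_sketch[8][2] = {
        { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
        { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }
    };

    /* One bilinear tap: round-to-nearest with 7-bit precision, as vqrshrn.u16 #7 does. */
    static unsigned char bilinear_tap_sketch(unsigned char a, unsigned char b, int offset)
    {
        const int *f = bilinear_taps_sketch[offset];
        return (unsigned char)((a * f[0] + b * f[1] + 64) >> 7);
    }

    /* Final reduction shared by the variance routines in these files. */
    unsigned int variance_reduce_sketch(int sum, unsigned int sse, unsigned int *sse_out)
    {
        *sse_out = sse;                                   /* vst1.32 {d1[0]}, [r6]      */
        return sse - (unsigned int)((sum * sum) >> 8);    /* vmull.s32 / vshr #8 / vsub */
    }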
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
new file mode 100644
index 000000000..3d02d7c40
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -0,0 +1,571 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sub_pixel_variance16x16s_4_0_neon|
+ EXPORT |vp8_sub_pixel_variance16x16s_0_4_neon|
+ EXPORT |vp8_sub_pixel_variance16x16s_4_4_neon|
+ EXPORT |vp8_sub_pixel_variance16x16s_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;================================================
+;unsigned int vp8_sub_pixel_variance16x16s_4_0_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp8_sub_pixel_variance16x16s_4_0_neon| PROC
+ push {lr}
+
+ mov r12, #4 ;loop counter
+ ldr lr, [sp, #4] ;load *sse from stack
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8_filt_fpo16x16s_4_0_loop_neon
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ vld1.8 {q11}, [r2], r3
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.8 {q12}, [r2], r3
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.8 {q13}, [r2], r3
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ vext.8 q3, q2, q3, #1
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vld1.8 {q14}, [r2], r3
+ vrhadd.u8 q1, q2, q3
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+
+ vsubl.u8 q4, d0, d22 ;diff
+ vsubl.u8 q5, d1, d23
+ vsubl.u8 q6, d2, d24
+ vsubl.u8 q7, d3, d25
+ vsubl.u8 q0, d4, d26
+ vsubl.u8 q1, d5, d27
+ vsubl.u8 q2, d6, d28
+ vsubl.u8 q3, d7, d29
+
+ vpadal.s16 q8, q4 ;sum
+ vmlal.s16 q9, d8, d8 ;sse
+ vmlal.s16 q10, d9, d9
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q5
+ vmlal.s16 q9, d10, d10
+ vmlal.s16 q10, d11, d11
+ vpadal.s16 q8, q6
+ vmlal.s16 q9, d12, d12
+ vmlal.s16 q10, d13, d13
+ vpadal.s16 q8, q7
+ vmlal.s16 q9, d14, d14
+ vmlal.s16 q10, d15, d15
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne vp8_filt_fpo16x16s_4_0_loop_neon
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;================================================
+;unsigned int vp8_sub_pixel_variance16x16s_0_4_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp8_sub_pixel_variance16x16s_0_4_neon| PROC
+ push {lr}
+
+ mov r12, #4 ;loop counter
+
+ vld1.u8 {q0}, [r0], r1 ;load src data
+ ldr lr, [sp, #4] ;load *sse from stack
+
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+vp8_filt_spo16x16s_0_4_loop_neon
+ vld1.u8 {q2}, [r0], r1
+ vld1.8 {q1}, [r2], r3
+ vld1.u8 {q4}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+ vld1.u8 {q6}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+ vld1.u8 {q15}, [r0], r1
+
+ vrhadd.u8 q0, q0, q2
+ vld1.8 {q7}, [r2], r3
+ vrhadd.u8 q2, q2, q4
+ vrhadd.u8 q4, q4, q6
+ vrhadd.u8 q6, q6, q15
+
+ vsubl.u8 q11, d0, d2 ;diff
+ vsubl.u8 q12, d1, d3
+ vsubl.u8 q13, d4, d6
+ vsubl.u8 q14, d5, d7
+ vsubl.u8 q0, d8, d10
+ vsubl.u8 q1, d9, d11
+ vsubl.u8 q2, d12, d14
+ vsubl.u8 q3, d13, d15
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+
+ vmov q0, q15
+
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne vp8_filt_spo16x16s_0_4_loop_neon
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;================================================
+;unsigned int vp8_sub_pixel_variance16x16s_4_4_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp8_sub_pixel_variance16x16s_4_4_neon| PROC
+ push {lr}
+
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+
+ ldr lr, [sp, #4] ;load *sse from stack
+ vmov.i8 q13, #0 ;q13 - sum
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+
+ vmov.i8 q14, #0 ;q14, q15 - sse
+ vmov.i8 q15, #0
+
+ mov r12, #4 ;loop counter
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8_filt16x16s_4_4_loop_neon
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+ vext.8 q9, q8, q9, #1
+
+ vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+
+ vld1.8 {q5}, [r2], r3
+ vrhadd.u8 q0, q0, q1
+ vld1.8 {q6}, [r2], r3
+ vrhadd.u8 q1, q1, q2
+ vld1.8 {q7}, [r2], r3
+ vrhadd.u8 q2, q2, q3
+ vld1.8 {q8}, [r2], r3
+ vrhadd.u8 q3, q3, q4
+
+ vsubl.u8 q9, d0, d10 ;diff
+ vsubl.u8 q10, d1, d11
+ vsubl.u8 q11, d2, d12
+ vsubl.u8 q12, d3, d13
+
+ vsubl.u8 q0, d4, d14 ;diff
+ vsubl.u8 q1, d5, d15
+ vsubl.u8 q5, d6, d16
+ vsubl.u8 q6, d7, d17
+
+ vpadal.s16 q13, q9 ;sum
+ vmlal.s16 q14, d18, d18 ;sse
+ vmlal.s16 q15, d19, d19
+
+ vpadal.s16 q13, q10 ;sum
+ vmlal.s16 q14, d20, d20 ;sse
+ vmlal.s16 q15, d21, d21
+
+ vpadal.s16 q13, q11 ;sum
+ vmlal.s16 q14, d22, d22 ;sse
+ vmlal.s16 q15, d23, d23
+
+ vpadal.s16 q13, q12 ;sum
+ vmlal.s16 q14, d24, d24 ;sse
+ vmlal.s16 q15, d25, d25
+
+ subs r12, r12, #1
+
+ vpadal.s16 q13, q0 ;sum
+ vmlal.s16 q14, d0, d0 ;sse
+ vmlal.s16 q15, d1, d1
+
+ vpadal.s16 q13, q1 ;sum
+ vmlal.s16 q14, d2, d2 ;sse
+ vmlal.s16 q15, d3, d3
+
+ vpadal.s16 q13, q5 ;sum
+ vmlal.s16 q14, d10, d10 ;sse
+ vmlal.s16 q15, d11, d11
+
+ vmov q0, q4
+
+ vpadal.s16 q13, q6 ;sum
+ vmlal.s16 q14, d12, d12 ;sse
+ vmlal.s16 q15, d13, d13
+
+ bne vp8_filt16x16s_4_4_loop_neon
+
+ vadd.u32 q15, q14, q15 ;accumulate sse
+ vpaddl.s32 q0, q13 ;accumulate sum
+
+ vpaddl.u32 q1, q15
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;==============================
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pixels_per_line,
+; stack unsigned int *sse
+;note: in vp8_find_best_half_pixel_step() (called when 8 < Speed < 15) and in the first call of
+;vp8_find_best_sub_pixel_step() (called when Speed <= 8), xoffset/yoffset can only be 4 or 0,
+;which means the filter is either bypassed or its coefficients are {64, 64}. This simplified
+;routine only works in that situation.
+;note: the case where both xoffset and yoffset are zero can occur; it can be handled in C code later.
+
+|vp8_sub_pixel_variance16x16s_neon| PROC
+ push {r4, lr}
+
+ ldr r4, [sp, #8] ;load *dst_ptr from stack
+ ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #16] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16s_only
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq firstpass_bfilter16x16s_only
+
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ mov r3, sp
+ mov r2, #4 ;loop counter
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16s_loop_neon
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+ vext.8 q9, q8, q9, #1
+
+ vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+
+ vrhadd.u8 q0, q0, q1
+ vrhadd.u8 q1, q1, q2
+ vrhadd.u8 q2, q2, q3
+ vrhadd.u8 q3, q3, q4
+
+ subs r2, r2, #1
+ vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
+ vmov q0, q4
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+
+ bne vp8e_filt_blk2d_fp16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;--------------------
+firstpass_bfilter16x16s_only
+ mov r2, #2 ;loop counter
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+ mov r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16s_loop_neon
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+ vext.8 q3, q2, q3, #1
+ vld1.u8 {d20, d21, d22, d23}, [r0], r1
+ vext.8 q5, q4, q5, #1
+ vld1.u8 {d24, d25, d26, d27}, [r0], r1
+ vext.8 q7, q6, q7, #1
+ vld1.u8 {d28, d29, d30, d31}, [r0], r1
+ vext.8 q9, q8, q9, #1
+ vext.8 q11, q10, q11, #1
+ vext.8 q13, q12, q13, #1
+ vext.8 q15, q14, q15, #1
+
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q1, q2, q3
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+ vrhadd.u8 q5, q10, q11
+ vrhadd.u8 q6, q12, q13
+ vrhadd.u8 q7, q14, q15
+
+ subs r2, r2, #1
+
+ vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+ vst1.u8 {d8, d9, d10, d11}, [r3]!
+ vst1.u8 {d12, d13, d14, d15}, [r3]!
+
+ bne vp8e_filt_blk2d_fpo16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;---------------------
+secondpass_bfilter16x16s_only
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+
+ mov r2, #2 ;loop counter
+ vld1.u8 {d0, d1}, [r0], r1 ;load src data
+ mov r3, sp
+
+vp8e_filt_blk2d_spo16x16s_loop_neon
+ vld1.u8 {d2, d3}, [r0], r1
+ vld1.u8 {d4, d5}, [r0], r1
+ vld1.u8 {d6, d7}, [r0], r1
+ vld1.u8 {d8, d9}, [r0], r1
+
+ vrhadd.u8 q0, q0, q1
+ vld1.u8 {d10, d11}, [r0], r1
+ vrhadd.u8 q1, q1, q2
+ vld1.u8 {d12, d13}, [r0], r1
+ vrhadd.u8 q2, q2, q3
+ vld1.u8 {d14, d15}, [r0], r1
+ vrhadd.u8 q3, q3, q4
+ vld1.u8 {d16, d17}, [r0], r1
+ vrhadd.u8 q4, q4, q5
+ vrhadd.u8 q5, q5, q6
+ vrhadd.u8 q6, q6, q7
+ vrhadd.u8 q7, q7, q8
+
+ subs r2, r2, #1
+
+ vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
+ vmov q0, q8
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+ vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
+ vst1.u8 {d12, d13, d14, d15}, [r3]!
+
+ bne vp8e_filt_blk2d_spo16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16s_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ sub r3, r3, #256
+ mov r2, #4
+
+sub_pixel_variance16x16s_neon_loop
+ vld1.8 {q0}, [r3]! ;Load up source and reference
+ vld1.8 {q1}, [r4], r12
+ vld1.8 {q2}, [r3]!
+ vld1.8 {q3}, [r4], r12
+ vld1.8 {q4}, [r3]!
+ vld1.8 {q5}, [r4], r12
+ vld1.8 {q6}, [r3]!
+ vld1.8 {q7}, [r4], r12
+
+ vsubl.u8 q11, d0, d2 ;diff
+ vsubl.u8 q12, d1, d3
+ vsubl.u8 q13, d4, d6
+ vsubl.u8 q14, d5, d7
+ vsubl.u8 q0, d8, d10
+ vsubl.u8 q1, d9, d11
+ vsubl.u8 q2, d12, d14
+ vsubl.u8 q3, d13, d15
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r2, r2, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne sub_pixel_variance16x16s_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ add sp, sp, #256
+ vmov.32 r0, d0[0] ;return
+
+ pop {r4, pc}
+ ENDP
+
+ END
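As the note at the top of vp8_sub_pixel_variance16x16s_neon explains, these routines only handle offsets of 0 or 4, where the bilinear taps are {64, 64}; the filter then collapses to a rounding average of two pixels, which is exactly what vrhadd.u8 computes per byte. A one-line sketch of that equivalence (function name illustrative):

    /* Scalar equivalent of vrhadd.u8 for the half-pel case (taps {64, 64}):
     * (64*a + 64*b + 64) >> 7 == (a + b + 1) >> 1. */
    static unsigned char half_pel_avg_sketch(unsigned char a, unsigned char b)
    {
        return (unsigned char)((a + b + 1) >> 1);
    }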
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
new file mode 100644
index 000000000..bd56761fa
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -0,0 +1,226 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sub_pixel_variance8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
+
+|vp8_sub_pixel_variance8x8_neon| PROC
+ push {r4-r5, lr}
+
+ ldr r12, _BilinearTaps_coeff_
+ ldr r4, [sp, #12] ;load *dst_ptr from stack
+ ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #20] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vld1.u8 {q2}, [r0], r1
+ vqrshrn.u16 d23, q7, #7
+ vld1.u8 {q3}, [r0], r1
+ vqrshrn.u16 d24, q8, #7
+ vld1.u8 {q4}, [r0], r1
+ vqrshrn.u16 d25, q9, #7
+
+    ;first_pass filtering on the remaining 5 lines of data
+ vld1.u8 {q5}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d27, q7, #7
+ vqrshrn.u16 d28, q8, #7
+ vqrshrn.u16 d29, q9, #7
+ vqrshrn.u16 d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ ;skip_secondpass_filter
+ beq sub_pixel_variance8x8_neon
+
+ add r3, r12, r3, lsl #3
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+ vmlal.u8 q5, d27, d1
+ vmlal.u8 q6, d28, d1
+ vmlal.u8 q7, d29, d1
+ vmlal.u8 q8, d30, d1
+
+ vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d23, q2, #7
+ vqrshrn.u16 d24, q3, #7
+ vqrshrn.u16 d25, q4, #7
+ vqrshrn.u16 d26, q5, #7
+ vqrshrn.u16 d27, q6, #7
+ vqrshrn.u16 d28, q7, #7
+ vqrshrn.u16 d29, q8, #7
+
+ b sub_pixel_variance8x8_neon
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+ vld1.u8 {d27}, [r0], r1
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+ b secondpass_filter
+
+;----------------------
+;vp8_variance8x8_neon
+sub_pixel_variance8x8_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #2
+
+sub_pixel_variance8x8_neon_loop
+ vld1.8 {d0}, [r4], r5 ;load dst data
+ subs r12, r12, #1
+ vld1.8 {d1}, [r4], r5
+ vld1.8 {d2}, [r4], r5
+ vsubl.u8 q4, d22, d0 ;calculate diff
+ vld1.8 {d3}, [r4], r5
+
+ vsubl.u8 q5, d23, d1
+ vsubl.u8 q6, d24, d2
+
+ vpadal.s16 q8, q4 ;sum
+ vmlal.s16 q9, d8, d8 ;sse
+ vmlal.s16 q10, d9, d9
+
+ vsubl.u8 q7, d25, d3
+
+ vpadal.s16 q8, q5
+ vmlal.s16 q9, d10, d10
+ vmlal.s16 q10, d11, d11
+
+ vmov q11, q13
+
+ vpadal.s16 q8, q6
+ vmlal.s16 q9, d12, d12
+ vmlal.s16 q10, d13, d13
+
+ vmov q12, q14
+
+ vpadal.s16 q8, q7
+ vmlal.s16 q9, d14, d14
+ vmlal.s16 q10, d15, d15
+
+ bne sub_pixel_variance8x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
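+    ;variance = sse - (sum * sum) / 64  (8x8 block = 64 pixels)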
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.s32 d10, d10, #6
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {r4-r5, pc}
+
+ ENDP
+
+;-----------------
+ AREA bilinear_taps_dat, DATA, READWRITE ;read/write by default
+;Data section bilinear_taps_dat: each DCD directive reserves one word.
+;_BilinearTaps_coeff_ holds the address of the coefficient table; its 16 entries
+;are accessed at bilinear_taps_coeff, bilinear_taps_coeff+4, bilinear_taps_coeff+8, ...
+_BilinearTaps_coeff_
+ DCD bilinear_taps_coeff
+bilinear_taps_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
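
For orientation, the routine above performs a horizontal bilinear filter over 9 rows of source pixels, a vertical bilinear filter reducing those to 8 rows, and then a variance computation of the filtered block against the reference, returning SSE - SUM^2/64. The following is a minimal C sketch of that computation, not part of the patch: the function name subpel_variance8x8_model and the freestanding layout are assumptions, the tap values are the ones from the data section above, and the xoffset=0 / yoffset=0 shortcuts in the assembly are omitted because the {128, 0} taps reduce to a plain copy.

/* Illustrative C model of the 8x8 sub-pixel variance computation above. */
static const int bilinear_taps[8][2] =
{
    {128, 0}, {112, 16}, {96, 32}, {80, 48},
    {64, 64}, {48, 80}, {32, 96}, {16, 112}
};

unsigned int subpel_variance8x8_model(const unsigned char *src, int src_stride,
                                      int xoffset, int yoffset,
                                      const unsigned char *dst, int dst_stride,
                                      unsigned int *sse)
{
    unsigned char tmp[9 * 8], pred[8 * 8];
    int r, c;
    int sum = 0;
    unsigned int s = 0;

    /* First pass: horizontal bilinear filter, 9 rows of 8 pixels. */
    for (r = 0; r < 9; r++)
        for (c = 0; c < 8; c++)
            tmp[r * 8 + c] = (unsigned char)((src[r * src_stride + c]     * bilinear_taps[xoffset][0] +
                                              src[r * src_stride + c + 1] * bilinear_taps[xoffset][1] +
                                              64) >> 7);

    /* Second pass: vertical bilinear filter, 8 rows of 8 pixels. */
    for (r = 0; r < 8; r++)
        for (c = 0; c < 8; c++)
            pred[r * 8 + c] = (unsigned char)((tmp[r * 8 + c]       * bilinear_taps[yoffset][0] +
                                               tmp[(r + 1) * 8 + c] * bilinear_taps[yoffset][1] +
                                               64) >> 7);

    /* Variance of the filtered prediction against the reference block. */
    for (r = 0; r < 8; r++)
        for (c = 0; c < 8; c++)
        {
            int d = pred[r * 8 + c] - dst[r * dst_stride + c];
            sum += d;
            s += (unsigned int)(d * d);
        }

    *sse = s;
    return s - (unsigned int)((sum * sum) >> 6);  /* 8x8 = 64 pixels */
}
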
diff --git a/vp8/encoder/arm/picklpf_arm.c b/vp8/encoder/arm/picklpf_arm.c
new file mode 100644
index 000000000..0586e55d8
--- /dev/null
+++ b/vp8/encoder/arm/picklpf_arm.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "alloccommon.h"
+
+extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+
+
+void
+vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction)
+{
+ unsigned char *src_y, *dst_y;
+ int yheight;
+ int ystride;
+ int border;
+ int yoffset;
+ int linestocopy;
+
+ border = src_ybc->border;
+ yheight = src_ybc->y_height;
+ ystride = src_ybc->y_stride;
+
+ linestocopy = (yheight >> (Fraction + 4));
+
+ if (linestocopy < 1)
+ linestocopy = 1;
+
+ linestocopy <<= 4;
+
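+    /* Start the copy 8 rows above the vertical mid-point of the frame (rounded
+       down to a macroblock row), so a centred band of the image is used. */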
+ yoffset = ystride * ((yheight >> 5) * 16 - 8);
+ src_y = src_ybc->y_buffer + yoffset;
+ dst_y = dst_ybc->y_buffer + yoffset;
+
+ //vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
+ vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride *(linestocopy + 16)));
+}
diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c
new file mode 100644
index 000000000..46906d3a2
--- /dev/null
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "quantize.h"
+#include "entropy.h"
+#include "predictdc.h"
+
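+/* For each coefficient position in raster order, this table gives its zig-zag
+   scan position plus one; the NEON quantizer uses it to track the highest
+   non-zero scan position (eob). */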
+DECLARE_ALIGNED(16, const short, vp8_rvsplus1_default_zig_zag1d[16]) =
+{
+ 1, 2, 6, 7,
+ 3, 5, 8, 13,
+ 4, 9, 12, 14,
+ 10, 11, 15, 16,
+};
+
+
+extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, short *qcoeff_ptr, short *dqcoeff_ptr, short *dequant_ptr, const short *scan_mask, short *round_ptr, short *quant_ptr);
+
+void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
+{
+ d->eob = vp8_fast_quantize_b_neon_func(b->coeff, &b->zbin[0][0], d->qcoeff, d->dqcoeff, d->dequant[0], vp8_rvsplus1_default_zig_zag1d, &b->round[0][0], &b->quant[0][0]);
+}
+
+/*
+//neon code is written according to the following rewritten c code
+void vp8_fast_quantize_b_neon(BLOCK *b,BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, x1, y, z, sz;
+ short *coeff_ptr = &b->Coeff[0];
+ short *zbin_ptr = &b->Zbin[0][0];
+ short *round_ptr = &b->Round[0][0];
+ short *quant_ptr = &b->Quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr= d->dqcoeff;
+ short *dequant_ptr= &d->Dequant[0][0];
+
+ eob = 0;
+
+ for(i=0;i<16;i++)
+ {
+ z = coeff_ptr[i];
+ zbin = zbin_ptr[i] ;
+ x = abs(z); // x = abs(z)
+
+ if(x>=zbin)
+ {
+ sz = (z>>31); // sign of z
+ y = ((x+round_ptr[i])*quant_ptr[i])>>16; // quantize (x)
+ x1 = (y^sz) - sz; // get the sign back
+
+ qcoeff_ptr[i] = x1; // write to destination
+ dqcoeff_ptr[i] = x1 * dequant_ptr[i]; // dequantized value
+
+ if(y)
+ {
+ if(eob<vp8_rvsplus1_default_zig_zag1d[i])
+ eob=(int)vp8_rvsplus1_default_zig_zag1d[i]; // last nonzero coeffs
+ }
+ }else
+ {
+ qcoeff_ptr[i] = 0; // write to destination
+ dqcoeff_ptr[i] = 0; // dequantized value
+ }
+ }
+ d->eob = eob;
+}
+*/
diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h
new file mode 100644
index 000000000..e93f0fef1
--- /dev/null
+++ b/vp8/encoder/arm/quantize_arm.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef QUANTIZE_ARM_H
+#define QUANTIZE_ARM_H
+
+#if HAVE_ARMV7
+extern prototype_quantize_block(vp8_fast_quantize_b_neon);
+
+#undef vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
+
+#endif
+
+#endif
diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
new file mode 100644
index 000000000..d9fc9b3e0
--- /dev/null
+++ b/vp8/encoder/arm/variance_arm.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_ARM_H
+#define VARIANCE_ARM_H
+
+#if HAVE_ARMV7
+extern prototype_sad(vp8_sad4x4_neon);
+extern prototype_sad(vp8_sad8x8_neon);
+extern prototype_sad(vp8_sad8x16_neon);
+extern prototype_sad(vp8_sad16x8_neon);
+extern prototype_sad(vp8_sad16x16_neon);
+
+//extern prototype_variance(vp8_variance4x4_c);
+extern prototype_variance(vp8_variance8x8_neon);
+extern prototype_variance(vp8_variance8x16_neon);
+extern prototype_variance(vp8_variance16x8_neon);
+extern prototype_variance(vp8_variance16x16_neon);
+
+//extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_c);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_neon);
+//extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c);
+//extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon);
+
+//extern prototype_getmbss(vp8_get_mb_ss_c);
+extern prototype_variance(vp8_mse16x16_neon);
+extern prototype_sad(vp8_get16x16pred_error_neon);
+//extern prototype_variance2(vp8_get8x8var_c);
+//extern prototype_variance2(vp8_get16x16var_c);
+extern prototype_sad(vp8_get4x4sse_cs_neon);
+
+#undef vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_neon
+
+#undef vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_neon
+
+#undef vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_neon
+
+#undef vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_neon
+
+#undef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_neon
+
+//#undef vp8_variance_var4x4
+//#define vp8_variance_var4x4 vp8_variance4x4_c
+
+#undef vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_neon
+
+#undef vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_neon
+
+#undef vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_neon
+
+#undef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_neon
+
+//#undef vp8_variance_subpixvar4x4
+//#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_c
+
+#undef vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_neon
+
+//#undef vp8_variance_subpixvar8x16
+//#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_c
+
+//#undef vp8_variance_subpixvar16x8
+//#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_c
+
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_neon
+
+//#undef vp8_variance_getmbss
+//#define vp8_variance_getmbss vp8_get_mb_ss_c
+
+#undef vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_neon
+
+#undef vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_neon
+
+//#undef vp8_variance_get8x8var
+//#define vp8_variance_get8x8var vp8_get8x8var_c
+
+//#undef vp8_variance_get16x16var
+//#define vp8_variance_get16x16var vp8_get16x16var_c
+
+#undef vp8_variance_get4x4sse_cs
+#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon
+
+#endif
+
+#endif
diff --git a/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
new file mode 100644
index 000000000..8cdf0791f
--- /dev/null
+++ b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <stddef.h>
+
+#include "../treewriter.h"
+#include "../tokenize.h"
+#include "../onyx_int.h"
+
+#define ct_assert(name,cond) \
+ static void assert_##name(void) UNUSED;\
+ static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
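+/* ct_assert fails to compile when cond is false: !!(cond) then evaluates to 0,
+   giving the switch statement two identical case labels. */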
+
+#define DEFINE(sym, val) int sym = val;
+
+/*
+#define BLANK() asm volatile("\n->" : : )
+*/
+
+/*
+ * int main(void)
+ * {
+ */
+
+DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue));
+DEFINE(vp8_writer_range, offsetof(vp8_writer, range));
+DEFINE(vp8_writer_value, offsetof(vp8_writer, value));
+DEFINE(vp8_writer_count, offsetof(vp8_writer, count));
+DEFINE(vp8_writer_pos, offsetof(vp8_writer, pos));
+DEFINE(vp8_writer_buffer, offsetof(vp8_writer, buffer));
+
+DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token));
+DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra));
+DEFINE(tokenextra_context_tree, offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_skip_eob_node, offsetof(TOKENEXTRA, skip_eob_node));
+DEFINE(TOKENEXTRA_SZ, sizeof(TOKENEXTRA));
+
+DEFINE(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct));
+
+DEFINE(vp8_token_value, offsetof(vp8_token, value));
+DEFINE(vp8_token_len, offsetof(vp8_token, Len));
+
+DEFINE(vp8_extra_bit_struct_tree, offsetof(vp8_extra_bit_struct, tree));
+DEFINE(vp8_extra_bit_struct_prob, offsetof(vp8_extra_bit_struct, prob));
+DEFINE(vp8_extra_bit_struct_prob_bc, offsetof(vp8_extra_bit_struct, prob_bc));
+DEFINE(vp8_extra_bit_struct_len, offsetof(vp8_extra_bit_struct, Len));
+DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, base_val));
+
+DEFINE(vp8_comp_tplist, offsetof(VP8_COMP, tplist));
+DEFINE(vp8_comp_common, offsetof(VP8_COMP, common));
+DEFINE(vp8_comp_bc2, offsetof(VP8_COMP, bc2));
+
+DEFINE(tokenlist_start, offsetof(TOKENLIST, start));
+DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop));
+DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
+
+DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows));
+
+// These two sizes are used in the ARM token-packing assembly
+// (vp8cx_pack_tokens_armv7 and related routines). They are hard coded, so if
+// either size changes the assembly will have to be adjusted.
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 20)
+ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 20)
+
+//add asserts for any offset that is not supported by assembly code
+//add asserts for any size that is not supported by assembly code
+/*
+ * return 0;
+ * }
+ */
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
new file mode 100644
index 000000000..31ad56a2a
--- /dev/null
+++ b/vp8/encoder/bitstream.c
@@ -0,0 +1,1719 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "header.h"
+#include "encodemv.h"
+#include "entropymode.h"
+#include "findnearmv.h"
+#include "mcomp.h"
+#include "systemdependent.h"
+#include <assert.h>
+#include <stdio.h>
+#include "pragmas.h"
+#include "vpx_mem/vpx_mem.h"
+#include "bitstream.h"
+
+const int vp8cx_base_skip_false_prob[128] =
+{
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 251, 248, 244, 240, 236, 232, 229, 225,
+ 221, 217, 213, 208, 204, 199, 194, 190,
+ 187, 183, 179, 175, 172, 168, 164, 160,
+ 157, 153, 149, 145, 142, 138, 134, 130,
+ 127, 124, 120, 117, 114, 110, 107, 104,
+ 101, 98, 95, 92, 89, 86, 83, 80,
+ 77, 74, 71, 68, 65, 62, 59, 56,
+ 53, 50, 47, 44, 41, 38, 35, 32,
+ 30, 28, 26, 24, 22, 20, 18, 16,
+};
+#ifdef VP8REF
+#define __int64 long long
+#endif
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+#endif
+
+#ifdef ENTROPY_STATS
+int intra_mode_stats[10][10][10];
+static unsigned int tree_update_hist [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] [2];
+extern unsigned int active_section;
+#endif
+
+#ifdef MODE_STATS
+int count_mb_seg[4] = { 0, 0, 0, 0 };
+#endif
+
+#if CONFIG_BIG_ENDIAN
+# define make_endian_16(a) \
+ (((unsigned int)(a & 0xff)) << 8) | (((unsigned int)(a & 0xff00)) >> 8)
+# define make_endian_32(a) \
+ (((unsigned int)(a & 0xff)) << 24) | (((unsigned int)(a & 0xff00)) << 8) | \
+ (((unsigned int)(a & 0xff0000)) >> 8) | (((unsigned int)(a & 0xff000000)) >> 24)
+#else
+# define make_endian_16(a) a
+# define make_endian_32(a) a
+#endif
+
+static void update_mode(
+ vp8_writer *const w,
+ int n,
+ vp8_token tok [/* n */],
+ vp8_tree tree,
+ vp8_prob Pnew [/* n-1 */],
+ vp8_prob Pcur [/* n-1 */],
+ unsigned int bct [/* n-1 */] [2],
+ const unsigned int num_events[/* n */]
+)
+{
+ unsigned int new_b = 0, old_b = 0;
+ int i = 0;
+
+ vp8_tree_probs_from_distribution(
+ n--, tok, tree,
+ Pnew, bct, num_events,
+ 256, 1
+ );
+
+ do
+ {
+ new_b += vp8_cost_branch(bct[i], Pnew[i]);
+ old_b += vp8_cost_branch(bct[i], Pcur[i]);
+ }
+ while (++i < n);
+
+ if (new_b + (n << 8) < old_b)
+ {
+ int i = 0;
+
+ vp8_write_bit(w, 1);
+
+ do
+ {
+ const vp8_prob p = Pnew[i];
+
+ vp8_write_literal(w, Pcur[i] = p ? p : 1, 8);
+ }
+ while (++i < n);
+ }
+ else
+ vp8_write_bit(w, 0);
+}
+
+static void update_mbintra_mode_probs(VP8_COMP *cpi)
+{
+ VP8_COMMON *const x = & cpi->common;
+
+ vp8_writer *const w = & cpi->bc;
+
+ {
+ vp8_prob Pnew [VP8_YMODES-1];
+ unsigned int bct [VP8_YMODES-1] [2];
+
+ update_mode(
+ w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree,
+ Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
+ );
+ }
+ {
+ vp8_prob Pnew [VP8_UV_MODES-1];
+ unsigned int bct [VP8_UV_MODES-1] [2];
+
+ update_mode(
+ w, VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree,
+ Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->uv_mode_count
+ );
+ }
+}
+
+static void write_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_ymode_tree, p, vp8_ymode_encodings + m);
+}
+
+static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m);
+}
+
+static void write_uv_mode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_uv_mode_encodings + m);
+}
+
+
+static void write_bmode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_bmode_tree, p, vp8_bmode_encodings + m);
+}
+
+static void write_split(vp8_writer *bc, int x)
+{
+ vp8_write_token(
+ bc, vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + x
+ );
+}
+
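+/* norm[r]: number of left shifts needed to bring a range value r back into
+   [128, 255] during renormalisation (7 minus the index of r's highest set bit). */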
+static const unsigned int norm[256] =
+{
+ 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+{
+ const TOKENEXTRA *const stop = p + xcount;
+ unsigned int split;
+ unsigned int shift;
+ int count = w->count;
+ unsigned int range = w->range;
+ unsigned int lowvalue = w->lowvalue;
+
+ while (p < stop)
+ {
+ const int t = p->Token;
+ vp8_token *const a = vp8_coef_encodings + t;
+ const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
+ int i = 0;
+ const unsigned char *pp = p->context_tree;
+ int v = a->value;
+ int n = a->Len;
+
+ if (p->skip_eob_node)
+ {
+ n--;
+ i = 2;
+ }
+
+ do
+ {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+ i = vp8_coef_tree[i+bb];
+
+ if (bb)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ shift = norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+ }
+
+ w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8 ;
+ }
+
+ lowvalue <<= shift;
+ }
+ while (n);
+
+
+ if (b->base_val)
+ {
+ const int e = p->Extra, L = b->Len;
+
+ if (L)
+ {
+ const unsigned char *pp = b->prob;
+ int v = e >> 1;
+ int n = L; /* number of bits in v, assumed nonzero */
+ int i = 0;
+
+ do
+ {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+ i = b->tree[i+bb];
+
+ if (bb)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ shift = norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+ }
+
+ w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8 ;
+ }
+
+ lowvalue <<= shift;
+ }
+ while (n);
+ }
+
+
+ {
+
+ split = (range + 1) >> 1;
+
+ if (e & 1)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ range <<= 1;
+
+ if ((lowvalue & 0x80000000))
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+
+ }
+
+ lowvalue <<= 1;
+
+ if (!++count)
+ {
+ count = -8;
+ w->buffer[w->pos++] = (lowvalue >> 24);
+ lowvalue &= 0xffffff;
+ }
+ }
+
+ }
+
+ ++p;
+ }
+
+ w->count = count;
+ w->lowvalue = lowvalue;
+ w->range = range;
+
+}
+
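+/* Write a partition length as a 3-byte little-endian value. */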
+static void write_partition_size(unsigned char *cx_data, int size)
+{
+ signed char csize;
+
+ csize = size & 0xff;
+ *cx_data = csize;
+ csize = (size >> 8) & 0xff;
+ *(cx_data + 1) = csize;
+ csize = (size >> 16) & 0xff;
+ *(cx_data + 2) = csize;
+
+}
+
+static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, int num_part, int *size)
+{
+
+ int i;
+ unsigned char *ptr = cx_data;
+ unsigned int shift;
+ vp8_writer *w = &cpi->bc2;
+ *size = 3 * (num_part - 1);
+ ptr = cx_data + (*size);
+
+ for (i = 0; i < num_part; i++)
+ {
+ vp8_start_encode(w, ptr);
+ {
+ unsigned int split;
+ int count = w->count;
+ unsigned int range = w->range;
+ unsigned int lowvalue = w->lowvalue;
+ int mb_row;
+
+ for (mb_row = i; mb_row < cpi->common.mb_rows; mb_row += num_part)
+ {
+ TOKENEXTRA *p = cpi->tplist[mb_row].start;
+ TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+
+ while (p < stop)
+ {
+ const int t = p->Token;
+ vp8_token *const a = vp8_coef_encodings + t;
+ const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
+ int i = 0;
+ const unsigned char *pp = p->context_tree;
+ int v = a->value;
+ int n = a->Len;
+
+ if (p->skip_eob_node)
+ {
+ n--;
+ i = 2;
+ }
+
+ do
+ {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+ i = vp8_coef_tree[i+bb];
+
+ if (bb)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ shift = norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+ }
+
+ w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8 ;
+ }
+
+ lowvalue <<= shift;
+ }
+ while (n);
+
+
+ if (b->base_val)
+ {
+ const int e = p->Extra, L = b->Len;
+
+ if (L)
+ {
+ const unsigned char *pp = b->prob;
+ int v = e >> 1;
+ int n = L; /* number of bits in v, assumed nonzero */
+ int i = 0;
+
+ do
+ {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+ i = b->tree[i+bb];
+
+ if (bb)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ shift = norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+ }
+
+ w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8 ;
+ }
+
+ lowvalue <<= shift;
+ }
+ while (n);
+ }
+
+ {
+ split = (range + 1) >> 1;
+
+ if (e & 1)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ range <<= 1;
+
+ if ((lowvalue & 0x80000000))
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+
+ }
+
+ lowvalue <<= 1;
+
+ if (!++count)
+ {
+ count = -8;
+ w->buffer[w->pos++] = (lowvalue >> 24);
+ lowvalue &= 0xffffff;
+ }
+ }
+
+ }
+
+ ++p;
+ }
+ }
+
+ w->count = count;
+ w->lowvalue = lowvalue;
+ w->range = range;
+
+ }
+
+ vp8_stop_encode(w);
+ *size += w->pos;
+
+ if (i < (num_part - 1))
+ {
+ write_partition_size(cx_data, w->pos);
+ cx_data += 3;
+ ptr += w->pos;
+ }
+ }
+}
+
+
+static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
+{
+
+ unsigned int split;
+ int count = w->count;
+ unsigned int range = w->range;
+ unsigned int lowvalue = w->lowvalue;
+ unsigned int shift;
+ int mb_row;
+
+ for (mb_row = 0; mb_row < cpi->common.mb_rows; mb_row++)
+ {
+ TOKENEXTRA *p = cpi->tplist[mb_row].start;
+ TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+
+ while (p < stop)
+ {
+ const int t = p->Token;
+ vp8_token *const a = vp8_coef_encodings + t;
+ const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
+ int i = 0;
+ const unsigned char *pp = p->context_tree;
+ int v = a->value;
+ int n = a->Len;
+
+ if (p->skip_eob_node)
+ {
+ n--;
+ i = 2;
+ }
+
+ do
+ {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+ i = vp8_coef_tree[i+bb];
+
+ if (bb)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ shift = norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+ }
+
+ w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8 ;
+ }
+
+ lowvalue <<= shift;
+ }
+ while (n);
+
+
+ if (b->base_val)
+ {
+ const int e = p->Extra, L = b->Len;
+
+ if (L)
+ {
+ const unsigned char *pp = b->prob;
+ int v = e >> 1;
+ int n = L; /* number of bits in v, assumed nonzero */
+ int i = 0;
+
+ do
+ {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+ i = b->tree[i+bb];
+
+ if (bb)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ shift = norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+ }
+
+ w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8 ;
+ }
+
+ lowvalue <<= shift;
+ }
+ while (n);
+ }
+
+ {
+ split = (range + 1) >> 1;
+
+ if (e & 1)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ range <<= 1;
+
+ if ((lowvalue & 0x80000000))
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+
+ }
+
+ lowvalue <<= 1;
+
+ if (!++count)
+ {
+ count = -8;
+ w->buffer[w->pos++] = (lowvalue >> 24);
+ lowvalue &= 0xffffff;
+ }
+ }
+
+ }
+
+ ++p;
+ }
+ }
+
+ w->count = count;
+ w->lowvalue = lowvalue;
+ w->range = range;
+
+}
+
+static void write_mv_ref
+(
+ vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+
+ assert(NEARESTMV <= m && m <= SPLITMV);
+
+ vp8_write_token(w, vp8_mv_ref_tree, p, VP8_MVREFENCODINGS + m);
+}
+
+static void write_sub_mv_ref
+(
+ vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+ assert(LEFT4X4 <= m && m <= NEW4X4);
+
+ vp8_write_token(w, vp8_sub_mv_ref_tree, p, VP8_SUBMVREFENCODINGS + m);
+}
+
+static void write_mv
+(
+ vp8_writer *w, const MV *mv, const MV *ref, const MV_CONTEXT *mvc
+)
+{
+ MV e;
+ e.row = mv->row - ref->row;
+ e.col = mv->col - ref->col;
+
+ vp8_encode_motion_vector(w, &e, mvc);
+}
+
+static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACROBLOCKD *x)
+{
+ // Encode the MB segment id.
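+    // It is coded with a two-level tree: the first bit (mb_segment_tree_probs[0])
+    // selects {0,1} vs {2,3}; the second bit (probs[1] or probs[2]) picks within the pair.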
+ if (x->segmentation_enabled && x->update_mb_segmentation_map)
+ {
+ switch (mi->segment_id)
+ {
+ case 0:
+ vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+ break;
+ case 1:
+ vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 1, x->mb_segment_tree_probs[1]);
+ break;
+ case 2:
+ vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 0, x->mb_segment_tree_probs[2]);
+ break;
+ case 3:
+ vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 1, x->mb_segment_tree_probs[2]);
+ break;
+
+ // TRAP.. This should not happen
+ default:
+ vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+ break;
+ }
+ }
+}
+
+
+static void pack_inter_mode_mvs(VP8_COMP *const cpi)
+{
+ VP8_COMMON *const pc = & cpi->common;
+ vp8_writer *const w = & cpi->bc;
+ const MV_CONTEXT *mvc = pc->fc.mvc;
+
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+ MODE_INFO *m = pc->mi, *ms;
+ const int mis = pc->mode_info_stride;
+ int mb_row = -1;
+
+ int prob_last_coded;
+ int prob_gf_coded;
+ int prob_skip_false = 0;
+ ms = pc->mi - 1;
+
+    // Calculate the probabilities to be used to code the reference frame based on actual usage in this frame
+ if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
+ cpi->prob_intra_coded = 1;
+
+ prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+ if (!prob_last_coded)
+ prob_last_coded = 1;
+
+ prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+ if (!prob_gf_coded)
+ prob_gf_coded = 1;
+
+
+#ifdef ENTROPY_STATS
+ active_section = 1;
+#endif
+
+ if (pc->mb_no_coeff_skip)
+ {
+ prob_skip_false = cpi->skip_false_count * 256 / (cpi->skip_false_count + cpi->skip_true_count);
+
+ if (prob_skip_false <= 1)
+ prob_skip_false = 1;
+
+ if (prob_skip_false > 255)
+ prob_skip_false = 255;
+
+ cpi->prob_skip_false = prob_skip_false;
+ vp8_write_literal(w, prob_skip_false, 8);
+ }
+
+ vp8_write_literal(w, cpi->prob_intra_coded, 8);
+ vp8_write_literal(w, prob_last_coded, 8);
+ vp8_write_literal(w, prob_gf_coded, 8);
+
+ update_mbintra_mode_probs(cpi);
+
+ vp8_write_mvprobs(cpi);
+
+ while (++mb_row < pc->mb_rows)
+ {
+ int mb_col = -1;
+
+ while (++mb_col < pc->mb_cols)
+ {
+ const MB_MODE_INFO *const mi = & m->mbmi;
+ const MV_REFERENCE_FRAME rf = mi->ref_frame;
+ const MB_PREDICTION_MODE mode = mi->mode;
+
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+ // Distance of Mb to the various image edges.
+            // These are specified in 1/8th pel units as they are always compared to MV values that are in 1/8th pel units
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+ xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+#ifdef ENTROPY_STATS
+ active_section = 9;
+#endif
+
+ if (cpi->mb.e_mbd.update_mb_segmentation_map)
+ write_mb_features(w, mi, &cpi->mb.e_mbd);
+
+ if (pc->mb_no_coeff_skip)
+ vp8_encode_bool(w, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+ if (rf == INTRA_FRAME)
+ {
+ vp8_write(w, 0, cpi->prob_intra_coded);
+#ifdef ENTROPY_STATS
+ active_section = 6;
+#endif
+ write_ymode(w, mode, pc->fc.ymode_prob);
+
+ if (mode == B_PRED)
+ {
+ int j = 0;
+
+ do
+ write_bmode(w, m->bmi[j].mode, pc->fc.bmode_prob);
+
+ while (++j < 16);
+ }
+
+ write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob);
+ }
+ else /* inter coded */
+ {
+ MV best_mv;
+ vp8_prob mv_ref_p [VP8_MVREFS-1];
+
+ vp8_write(w, 1, cpi->prob_intra_coded);
+
+ if (rf == LAST_FRAME)
+ vp8_write(w, 0, prob_last_coded);
+ else
+ {
+ vp8_write(w, 1, prob_last_coded);
+ vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, prob_gf_coded);
+ }
+
+ {
+ MV n1, n2;
+ int ct[4];
+
+ vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, cpi->common.ref_frame_sign_bias);
+ vp8_mv_ref_probs(mv_ref_p, ct);
+
+#ifdef ENTROPY_STATS
+ accum_mv_refs(mode, ct);
+#endif
+
+ }
+
+#ifdef ENTROPY_STATS
+ active_section = 3;
+#endif
+
+ write_mv_ref(w, mode, mv_ref_p);
+
+ switch (mode) /* new, split require MVs */
+ {
+ case NEWMV:
+
+#ifdef ENTROPY_STATS
+ active_section = 5;
+#endif
+
+ write_mv(w, &mi->mv.as_mv, &best_mv, mvc);
+ break;
+
+ case SPLITMV:
+ {
+ int j = 0;
+
+#ifdef MODE_STATS
+ ++count_mb_seg [mi->partitioning];
+#endif
+
+ write_split(w, mi->partitioning);
+
+ do
+ {
+ const B_MODE_INFO *const b = mi->partition_bmi + j;
+ const int *const L = vp8_mbsplits [mi->partitioning];
+ int k = -1; /* first block in subset j */
+ int mv_contz;
+
+ while (j != L[++k])
+ if (k >= 16)
+ assert(0);
+
+ mv_contz = vp8_mv_cont
+ (&(vp8_left_bmi(m, k)->mv.as_mv),
+ &(vp8_above_bmi(m, k, mis)->mv.as_mv));
+ write_sub_mv_ref(w, b->mode, vp8_sub_mv_ref_prob2 [mv_contz]); //pc->fc.sub_mv_ref_prob);
+
+ if (b->mode == NEW4X4)
+ {
+#ifdef ENTROPY_STATS
+ active_section = 11;
+#endif
+ write_mv(w, &b->mv.as_mv, &best_mv, (const MV_CONTEXT *) mvc);
+ }
+ }
+ while (++j < mi->partition_count);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ ++m;
+ }
+
+ ++m; /* skip L prediction border */
+ }
+}
+
+
+static void write_kfmodes(VP8_COMP *cpi)
+{
+ vp8_writer *const bc = & cpi->bc;
+ const VP8_COMMON *const c = & cpi->common;
+ /* const */
+ MODE_INFO *m = c->mi;
+
+ int mb_row = -1;
+ int prob_skip_false = 0;
+
+ if (c->mb_no_coeff_skip)
+ {
+ prob_skip_false = cpi->skip_false_count * 256 / (cpi->skip_false_count + cpi->skip_true_count);
+
+ if (prob_skip_false <= 1)
+ prob_skip_false = 1;
+
+ if (prob_skip_false >= 255)
+ prob_skip_false = 255;
+
+ cpi->prob_skip_false = prob_skip_false;
+ vp8_write_literal(bc, prob_skip_false, 8);
+ }
+
+ while (++mb_row < c->mb_rows)
+ {
+ int mb_col = -1;
+
+ while (++mb_col < c->mb_cols)
+ {
+ const int ym = m->mbmi.mode;
+
+ if (cpi->mb.e_mbd.update_mb_segmentation_map)
+ write_mb_features(bc, &m->mbmi, &cpi->mb.e_mbd);
+
+ if (c->mb_no_coeff_skip)
+ vp8_encode_bool(bc, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+ kfwrite_ymode(bc, ym, c->kf_ymode_prob);
+
+ if (ym == B_PRED)
+ {
+ const int mis = c->mode_info_stride;
+ int i = 0;
+
+ do
+ {
+ const B_PREDICTION_MODE A = vp8_above_bmi(m, i, mis)->mode;
+ const B_PREDICTION_MODE L = vp8_left_bmi(m, i)->mode;
+ const int bm = m->bmi[i].mode;
+
+#ifdef ENTROPY_STATS
+ ++intra_mode_stats [A] [L] [bm];
+#endif
+
+ write_bmode(bc, bm, c->kf_bmode_prob [A] [L]);
+ }
+ while (++i < 16);
+ }
+
+ write_uv_mode(bc, (m++)->mbmi.uv_mode, c->kf_uv_mode_prob);
+ }
+
+ m++; // skip L prediction border
+ }
+}
+int vp8_estimate_entropy_savings(VP8_COMP *cpi)
+{
+ int i = 0;
+ int savings = 0;
+
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+ int new_intra, new_last, gf_last, oldtotal, newtotal;
+ int ref_frame_cost[MAX_REF_FRAMES];
+
+ vp8_clear_system_state(); //__asm emms;
+
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ if (!(new_intra = rf_intra * 255 / (rf_intra + rf_inter)))
+ new_intra = 1;
+
+ new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+ gf_last = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+ // new costs
+ ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(new_intra);
+ ref_frame_cost[LAST_FRAME] = vp8_cost_one(new_intra)
+ + vp8_cost_zero(new_last);
+ ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(new_intra)
+ + vp8_cost_one(new_last)
+ + vp8_cost_zero(gf_last);
+ ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(new_intra)
+ + vp8_cost_one(new_last)
+ + vp8_cost_one(gf_last);
+
+ newtotal =
+ rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+ rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+ rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+ rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+
+ // old costs
+ ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded);
+ ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_zero(cpi->prob_last_coded);
+ ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(cpi->prob_last_coded)
+ + vp8_cost_zero(cpi->prob_gf_coded);
+ ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(cpi->prob_last_coded)
+ + vp8_cost_one(cpi->prob_gf_coded);
+
+ oldtotal =
+ rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+ rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+ rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+ rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+ savings += (oldtotal - newtotal) / 256;
+ }
+
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ int k = 0;
+
+ do
+ {
+ /* at every context */
+
+ /* calc probs and branch cts for this frame only */
+ //vp8_prob new_p [vp8_coef_tokens-1];
+ //unsigned int branch_ct [vp8_coef_tokens-1] [2];
+
+ int t = 0; /* token/prob index */
+
+ vp8_tree_probs_from_distribution(
+ vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree,
+ cpi->frame_coef_probs [i][j][k], cpi->frame_branch_ct [i][j][k], cpi->coef_counts [i][j][k],
+ 256, 1
+ );
+
+ do
+ {
+ const unsigned int *ct = cpi->frame_branch_ct [i][j][k][t];
+ const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+
+ const vp8_prob old = cpi->common.fc.coef_probs [i][j][k][t];
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+ const int old_b = vp8_cost_branch(ct, old);
+ const int new_b = vp8_cost_branch(ct, newp);
+
+ const int update_b = 8 +
+ ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8);
+
+ const int s = old_b - new_b - update_b;
+
+ if (s > 0)
+ savings += s;
+
+
+ }
+ while (++t < vp8_coef_tokens - 1);
+
+
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+
+ return savings;
+}
+
+static void update_coef_probs(VP8_COMP *cpi)
+{
+ int i = 0;
+ vp8_writer *const w = & cpi->bc;
+ int savings = 0;
+
+ vp8_clear_system_state(); //__asm emms;
+
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ int k = 0;
+
+ do
+ {
+ //note: use result from vp8_estimate_entropy_savings, so no need to call vp8_tree_probs_from_distribution here.
+ /* at every context */
+
+ /* calc probs and branch cts for this frame only */
+ //vp8_prob new_p [vp8_coef_tokens-1];
+ //unsigned int branch_ct [vp8_coef_tokens-1] [2];
+
+ int t = 0; /* token/prob index */
+
+ //vp8_tree_probs_from_distribution(
+ // vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree,
+ // new_p, branch_ct, (unsigned int *)cpi->coef_counts [i][j][k],
+ // 256, 1
+ // );
+
+ do
+ {
+ const unsigned int *ct = cpi->frame_branch_ct [i][j][k][t];
+ const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+
+ vp8_prob *Pold = cpi->common.fc.coef_probs [i][j][k] + t;
+ const vp8_prob old = *Pold;
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+ const int old_b = vp8_cost_branch(ct, old);
+ const int new_b = vp8_cost_branch(ct, newp);
+
+ const int update_b = 8 +
+ ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8);
+
+ const int s = old_b - new_b - update_b;
+ const int u = s > 0 ? 1 : 0;
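+                    /* u is 1 only when the measured bit saving exceeds the cost of
+                       coding the new probability (8 bits plus the update flag). */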
+
+ vp8_write(w, u, upd);
+
+
+#ifdef ENTROPY_STATS
+ ++ tree_update_hist [i][j][k][t] [u];
+#endif
+
+ if (u)
+ {
+ /* send/use new probability */
+
+ *Pold = newp;
+ vp8_write_literal(w, newp, 8);
+
+ savings += s;
+
+ }
+
+ }
+ while (++t < vp8_coef_tokens - 1);
+
+ /* Accum token counts for generation of default statistics */
+#ifdef ENTROPY_STATS
+ t = 0;
+
+ do
+ {
+ context_counters [i][j][k][t] += cpi->coef_counts [i][j][k][t];
+ }
+ while (++t < vp8_coef_tokens);
+
+#endif
+
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+
+}
+#ifdef PACKET_TESTING
+FILE *vpxlogc = 0;
+#endif
+
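+/* A delta-Q value is coded as an update flag, followed by a 4-bit magnitude and
+   a sign bit when it is non-zero. */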
+static void put_delta_q(vp8_writer *bc, int delta_q)
+{
+ if (delta_q != 0)
+ {
+ vp8_write_bit(bc, 1);
+ vp8_write_literal(bc, abs(delta_q), 4);
+
+ if (delta_q < 0)
+ vp8_write_bit(bc, 1);
+ else
+ vp8_write_bit(bc, 0);
+ }
+ else
+ vp8_write_bit(bc, 0);
+}
+
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
+{
+ int i, j;
+ VP8_HEADER oh;
+ VP8_COMMON *const pc = & cpi->common;
+ vp8_writer *const bc = & cpi->bc;
+ MACROBLOCKD *const xd = & cpi->mb.e_mbd;
+ int extra_bytes_packed = 0;
+
+ unsigned char *cx_data = dest;
+ const int *mb_feature_data_bits;
+
+ oh.show_frame = (int) pc->show_frame;
+ oh.type = (int)pc->frame_type;
+ oh.version = pc->version;
+
+ mb_feature_data_bits = vp8_mb_feature_data_bits;
+ cx_data += 3;
+
+#if defined(SECTIONBITS_OUTPUT)
+ Sectionbits[active_section = 1] += sizeof(VP8_HEADER) * 8 * 256;
+#endif
+
+    //vp8_kf_default_bmode_probs() is called in vp8_setup_key_frame() once for each
+    //key frame before the frame is encoded. pc->kf_bmode_prob doesn't get changed
+    //anywhere else, so there is no need to call it again here. --yw
+ //vp8_kf_default_bmode_probs( pc->kf_bmode_prob);
+
+    // Every key frame sends the start code, width, height, scale factors, clamp type and color type
+ if (oh.type == KEY_FRAME)
+ {
+ int w, h, hs, vs;
+
+ // Start / synch code
+ cx_data[0] = 0x9D;
+ cx_data[1] = 0x01;
+ cx_data[2] = 0x2a;
+
+ *((unsigned short *)(cx_data + 3)) = make_endian_16((pc->horiz_scale << 14) | pc->Width);
+ *((unsigned short *)(cx_data + 5)) = make_endian_16((pc->vert_scale << 14) | pc->Height);
+
+ extra_bytes_packed = 7;
+ cx_data += extra_bytes_packed ;
+
+ vp8_start_encode(bc, cx_data);
+
+ // signal clr type
+ vp8_write_bit(bc, pc->clr_type);
+ vp8_write_bit(bc, pc->clamp_type);
+
+ }
+ else
+ vp8_start_encode(bc, cx_data);
+
+
+ // Signal whether or not Segmentation is enabled
+ vp8_write_bit(bc, (xd->segmentation_enabled) ? 1 : 0);
+
+ // Indicate which features are enabled
+ if (xd->segmentation_enabled)
+ {
+ // Signal whether or not the segmentation map is being updated.
+ vp8_write_bit(bc, (xd->update_mb_segmentation_map) ? 1 : 0);
+ vp8_write_bit(bc, (xd->update_mb_segmentation_data) ? 1 : 0);
+
+ if (xd->update_mb_segmentation_data)
+ {
+ signed char Data;
+
+ vp8_write_bit(bc, (xd->mb_segement_abs_delta) ? 1 : 0);
+
+ // For each segmentation feature (Quant and loop filter level)
+ for (i = 0; i < MB_LVL_MAX; i++)
+ {
+ // For each of the segments
+ for (j = 0; j < MAX_MB_SEGMENTS; j++)
+ {
+ Data = xd->segment_feature_data[i][j];
+
+ // Frame level data
+ if (Data)
+ {
+ vp8_write_bit(bc, 1);
+
+ if (Data < 0)
+ {
+ Data = - Data;
+ vp8_write_literal(bc, Data, mb_feature_data_bits[i]);
+ vp8_write_bit(bc, 1);
+ }
+ else
+ {
+ vp8_write_literal(bc, Data, mb_feature_data_bits[i]);
+ vp8_write_bit(bc, 0);
+ }
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+ }
+ }
+
+ if (xd->update_mb_segmentation_map)
+ {
+ // Write the probs used to decode the segment id for each macro block.
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+ {
+ int Data = xd->mb_segment_tree_probs[i];
+
+ if (Data != 255)
+ {
+ vp8_write_bit(bc, 1);
+ vp8_write_literal(bc, Data, 8);
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+ }
+ }
+
+    // Write the loop filter type, level and sharpness level
+ vp8_write_bit(bc, pc->filter_type);
+ vp8_write_literal(bc, pc->filter_level, 6);
+ vp8_write_literal(bc, pc->sharpness_level, 3);
+
+ // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
+ vp8_write_bit(bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
+
+ if (xd->mode_ref_lf_delta_enabled)
+ {
+ // Do the deltas need to be updated
+ vp8_write_bit(bc, (xd->mode_ref_lf_delta_update) ? 1 : 0);
+
+ if (xd->mode_ref_lf_delta_update)
+ {
+ int Data;
+
+ // Send update
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+ {
+ Data = xd->ref_lf_deltas[i];
+
+ // Frame level data
+ if (Data)
+ {
+ vp8_write_bit(bc, 1);
+
+ if (Data > 0)
+ {
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 0); // sign
+ }
+ else
+ {
+ Data = -Data;
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 1); // sign
+ }
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+
+ // Send update
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+ {
+ Data = xd->mode_lf_deltas[i];
+
+ if (Data)
+ {
+ vp8_write_bit(bc, 1);
+
+ if (Data > 0)
+ {
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 0); // sign
+ }
+ else
+ {
+ Data = -Data;
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 1); // sign
+ }
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+ }
+ }
+
+    // Signal how the token data is partitioned (2-bit code for 1, 2, 4 or 8 partitions)
+ vp8_write_literal(bc, pc->multi_token_partition, 2);
+
+ // Frame Qbaseline quantizer index
+ vp8_write_literal(bc, pc->base_qindex, 7);
+
+ // Transmit Dc, Second order and Uv quantizer delta information
+ put_delta_q(bc, pc->y1dc_delta_q);
+ put_delta_q(bc, pc->y2dc_delta_q);
+ put_delta_q(bc, pc->y2ac_delta_q);
+ put_delta_q(bc, pc->uvdc_delta_q);
+ put_delta_q(bc, pc->uvac_delta_q);
+
+ // When there is a key frame all reference buffers are updated using the new key frame
+ if (pc->frame_type != KEY_FRAME)
+ {
+ // Should the GF or ARF be updated using the transmitted frame or buffer
+ vp8_write_bit(bc, pc->refresh_golden_frame);
+ vp8_write_bit(bc, pc->refresh_alt_ref_frame);
+
+        // If not refreshed from the current frame, signal whether the GF or ARF should instead be copied from another reference buffer
+ if (!pc->refresh_golden_frame)
+ vp8_write_literal(bc, pc->copy_buffer_to_gf, 2);
+
+ if (!pc->refresh_alt_ref_frame)
+ vp8_write_literal(bc, pc->copy_buffer_to_arf, 2);
+
+ // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
+ vp8_write_bit(bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
+ vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+ }
+
+ vp8_write_bit(bc, pc->refresh_entropy_probs);
+
+ if (pc->frame_type != KEY_FRAME)
+ vp8_write_bit(bc, pc->refresh_last_frame);
+
+#ifdef ENTROPY_STATS
+
+ if (pc->frame_type == INTER_FRAME)
+ active_section = 0;
+ else
+ active_section = 7;
+
+#endif
+
+ vp8_clear_system_state(); //__asm emms;
+
+ //************************************************
+ // save a copy for later refresh
+ {
+ vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+ }
+
+ update_coef_probs(cpi);
+
+#ifdef ENTROPY_STATS
+ active_section = 2;
+#endif
+
+ // Write out the mb_no_coeff_skip flag
+ vp8_write_bit(bc, pc->mb_no_coeff_skip);
+
+ if (pc->frame_type == KEY_FRAME)
+ {
+ write_kfmodes(cpi);
+
+#ifdef ENTROPY_STATS
+ active_section = 8;
+#endif
+ }
+ else
+ {
+ pack_inter_mode_mvs(cpi);
+
+#ifdef ENTROPY_STATS
+ active_section = 1;
+#endif
+ }
+
+ vp8_stop_encode(bc);
+
+
+ if (pc->multi_token_partition != ONE_PARTITION)
+ {
+ int num_part;
+ int asize;
+ num_part = 1 << pc->multi_token_partition;
+
+ pack_tokens_into_partitions(cpi, cx_data + bc->pos, num_part, &asize);
+
+ oh.first_partition_length_in_bytes = cpi->bc.pos;
+
+ *size = cpi->bc.pos + VP8_HEADER_SIZE + asize + extra_bytes_packed;
+ }
+ else
+ {
+ vp8_start_encode(&cpi->bc2, cx_data + bc->pos);
+
+ if (!cpi->b_multi_threaded)
+ pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
+ else
+ pack_mb_row_tokens(cpi, &cpi->bc2);
+
+ vp8_stop_encode(&cpi->bc2);
+ oh.first_partition_length_in_bytes = cpi->bc.pos ;
+ *size = cpi->bc2.pos + cpi->bc.pos + VP8_HEADER_SIZE + extra_bytes_packed;
+ }
+
+#if CONFIG_BIG_ENDIAN
+ {
+ int v = (oh.first_partition_length_in_bytes << 5) |
+ (oh.show_frame << 4) |
+ (oh.version << 1) |
+ oh.type;
+
+ v = make_endian_32(v);
+ vpx_memcpy(dest, &v, 3);
+ }
+#else
+ vpx_memcpy(dest, &oh, 3);
+#endif
+}
+
+#ifdef ENTROPY_STATS
+void print_tree_update_probs()
+{
+ int i, j, k, l;
+ FILE *f = fopen("context.c", "a");
+ int Sum;
+ fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
+ fprintf(f, "const vp8_prob tree_update_probs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] = {\n");
+
+ for (i = 0; i < BLOCK_TYPES; i++)
+ {
+ fprintf(f, " { \n");
+
+ for (j = 0; j < COEF_BANDS; j++)
+ {
+ fprintf(f, " {\n");
+
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+ {
+ fprintf(f, " {");
+
+ for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++)
+ {
+ Sum = tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1];
+
+ if (Sum > 0)
+ {
+ if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0)
+ fprintf(f, "%3ld, ", (tree_update_hist[i][j][k][l][0] * 255) / Sum);
+ else
+ fprintf(f, "%3ld, ", 1);
+ }
+ else
+ fprintf(f, "%3ld, ", 128);
+ }
+
+ fprintf(f, "},\n");
+ }
+
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, "};\n");
+ fclose(f);
+}
+#endif
diff --git a/vp8/encoder/bitstream.h b/vp8/encoder/bitstream.h
new file mode 100644
index 000000000..ee69f66e4
--- /dev/null
+++ b/vp8/encoder/bitstream.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BITSTREAM_H
+#define __INC_BITSTREAM_H
+
+#if HAVE_ARMV7
+void vp8cx_pack_tokens_armv7(vp8_writer *w, const TOKENEXTRA *p, int xcount,
+ vp8_token *,
+ vp8_extra_bit_struct *,
+ const vp8_tree_index *);
+void vp8cx_pack_tokens_into_partitions_armv7(VP8_COMP *, unsigned char *, int , int *,
+ vp8_token *,
+ vp8_extra_bit_struct *,
+ const vp8_tree_index *);
+void vp8cx_pack_mb_row_tokens_armv7(VP8_COMP *cpi, vp8_writer *w,
+ vp8_token *,
+ vp8_extra_bit_struct *,
+ const vp8_tree_index *);
+# define pack_tokens(a,b,c) \
+ vp8cx_pack_tokens_armv7(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+# define pack_tokens_into_partitions(a,b,c,d) \
+ vp8cx_pack_tokens_into_partitions_armv7(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+# define pack_mb_row_tokens(a,b) \
+ vp8cx_pack_mb_row_tokens_armv7(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+#else
+# define pack_tokens(a,b,c) pack_tokens_c(a,b,c)
+# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d)
+# define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b)
+#endif
+#endif
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
new file mode 100644
index 000000000..cc4cbe067
--- /dev/null
+++ b/vp8/encoder/block.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCK_H
+#define __INC_BLOCK_H
+
+#include "onyx.h"
+#include "blockd.h"
+#include "entropymv.h"
+#include "entropy.h"
+#include "vpx_ports/mem.h"
+
+// motion search site
+typedef struct
+{
+ MV mv;
+ int offset;
+} search_site;
+
+typedef struct
+{
+ // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+ short *src_diff;
+ short *coeff;
+
+ // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+ short(*quant)[4];
+ short(*zbin)[4];
+ short(*zrun_zbin_boost);
+ short(*round)[4];
+
+ // Zbin Over Quant value
+ short zbin_extra;
+
+ unsigned char **base_src;
+ int src;
+ int src_stride;
+
+// MV enc_mv;
+ int force_empty;
+
+} BLOCK;
+
+typedef struct
+{
+ DECLARE_ALIGNED(16, short, src_diff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+ DECLARE_ALIGNED(16, short, coeff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+
+ // 16 Y blocks, 4 U blocks, 4 V blocks, 1 DC 2nd order block each with 16 entries
+ BLOCK block[25];
+
+ YV12_BUFFER_CONFIG src;
+
+ MACROBLOCKD e_mbd;
+
+ search_site *ss;
+ int ss_count;
+ int searches_per_step;
+
+ int errorperbit;
+ int sadperbit16;
+ int sadperbit4;
+ int errthresh;
+ int rddiv;
+ int rdmult;
+
+ int mvcosts[2][MVvals+1];
+ int *mvcost[2];
+ int mvsadcosts[2][MVvals+1];
+ int *mvsadcost[2];
+ int mbmode_cost[2][MB_MODE_COUNT];
+ int intra_uv_mode_cost[2][MB_MODE_COUNT];
+ unsigned int bmode_costs[10][10][10];
+ unsigned int inter_bmode_costs[B_MODE_COUNT];
+
+ // These define limits to motion vector components to prevent them from extending outside the UMV borders
+ int mv_col_min;
+ int mv_col_max;
+ int mv_row_min;
+ int mv_row_max;
+
+ int vector_range; // Used to monitor limiting range of recent vectors to guide search.
+ int skip;
+
+ int encode_breakout;
+
+ unsigned char *active_ptr;
+ MV_CONTEXT *mvc;
+
+ unsigned int token_costs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];
+ int optimize;
+
+ void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+ void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+ void (*short_fdct4x4rd)(short *input, short *output, int pitch);
+ void (*short_fdct8x4rd)(short *input, short *output, int pitch);
+ void (*vp8_short_fdct4x4_ptr)(short *input, short *output, int pitch);
+ void (*short_walsh4x4)(short *input, short *output, int pitch);
+
+ void (*quantize_b)(BLOCK *b, BLOCKD *d);
+ void (*quantize_brd)(BLOCK *b, BLOCKD *d);
+
+
+
+} MACROBLOCK;
+
+
+#endif
diff --git a/vp8/encoder/boolhuff.c b/vp8/encoder/boolhuff.c
new file mode 100644
index 000000000..c101384d9
--- /dev/null
+++ b/vp8/encoder/boolhuff.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "boolhuff.h"
+#include "blockd.h"
+
+
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+
+#endif
+
+#ifdef ENTROPY_STATS
+unsigned int active_section = 0;
+#endif
+
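+/* vp8_prob_cost[p] is approximately -256 * log2(p / 256): the cost, in
+   1/256th-bit units, of coding an event whose probability is p/256. */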
+const unsigned int vp8_prob_cost[256] =
+{
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+ 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
+ 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
+ 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
+ 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
+ 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
+ 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
+ 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
+ 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
+ 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
+ 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
+};
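+// These values appear to be bit costs in 1/256-bit units for coding a symbol
+// with the given probability p, i.e. roughly 256 * log2(256 / p).  For example,
+// p = 128 (an even split) costs about 1 bit (table value 255) and p = 1 costs
+// about 8 bits (table value 2047).  Entry 0 is a dummy, since a probability of
+// zero is never used.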
+
+void vp8_start_encode(BOOL_CODER *br, unsigned char *source)
+{
+
+ br->lowvalue = 0;
+ br->range = 255;
+ br->value = 0;
+ br->count = -24;
+ br->buffer = source;
+ br->pos = 0;
+}
+
+void vp8_stop_encode(BOOL_CODER *br)
+{
+ int i;
+
+ for (i = 0; i < 32; i++)
+ vp8_encode_bool(br, 0, 128);
+}
+
+DECLARE_ALIGNED(16, static const unsigned int, norm[256]) =
+{
+ 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
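+// norm[v] appears to give the number of left shifts needed to bring v (1..255)
+// back into the interval [128, 255], i.e. 7 minus the index of its highest set
+// bit; it is used by vp8_encode_bool() to renormalize the coder range after
+// each bit.  norm[0] is never used because the range cannot reach zero.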
+
+void vp8_encode_bool(BOOL_CODER *br, int bit, int probability)
+{
+ unsigned int split;
+ int count = br->count;
+ unsigned int range = br->range;
+ unsigned int lowvalue = br->lowvalue;
+ register unsigned int shift;
+
+#ifdef ENTROPY_STATS
+#if defined(SECTIONBITS_OUTPUT)
+
+ if (bit)
+ Sectionbits[active_section] += vp8_prob_cost[255-probability];
+ else
+ Sectionbits[active_section] += vp8_prob_cost[probability];
+
+#endif
+#endif
+
+ split = 1 + (((range - 1) * probability) >> 8);
+
+ range = split;
+
+ if (bit)
+ {
+ lowvalue += split;
+ range = br->range - split;
+ }
+
+ shift = norm[range];
+
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = br->pos - 1;
+
+ while (x >= 0 && br->buffer[x] == 0xff)
+ {
+ br->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ br->buffer[x] += 1;
+ }
+
+ br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8;
+ }
+
+ lowvalue <<= shift;
+ br->count = count;
+ br->lowvalue = lowvalue;
+ br->range = range;
+}
+
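+// vp8_encode_value() writes a plain literal of 'bits' bits, most significant
+// bit first, coding each bit at probability 128 (an even split) so it costs
+// roughly one bit.  Illustrative call (the variable names are hypothetical):
+//
+//     vp8_encode_value(&header_bc, q_index, 7);   // e.g. a 7-bit quantizer index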
+void vp8_encode_value(BOOL_CODER *br, int data, int bits)
+{
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--)
+ vp8_encode_bool(br, (1 & (data >> bit)), 0x80);
+
+}
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
new file mode 100644
index 000000000..0d929f067
--- /dev/null
+++ b/vp8/encoder/boolhuff.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : boolhuff.h
+*
+* Description : Bool Coder header file.
+*
+****************************************************************************/
+#ifndef __INC_BOOLHUFF_H
+#define __INC_BOOLHUFF_H
+
+
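+// In the BOOL_CODER below, lowvalue and range hold the current arithmetic
+// coding interval, while count tracks how many bits have accumulated before
+// the next byte is flushed into buffer[pos] by vp8_encode_bool().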
+typedef struct
+{
+ unsigned int lowvalue;
+ unsigned int range;
+ unsigned int value;
+ int count;
+ unsigned int pos;
+ unsigned char *buffer;
+
+ // Variables used to track bit costs without outputting to the bitstream
+ unsigned int measure_cost;
+ unsigned long bit_counter;
+} BOOL_CODER;
+
+extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer);
+extern void vp8_encode_bool(BOOL_CODER *bc, int x, int context);
+extern void vp8_encode_value(BOOL_CODER *br, int data, int bits);
+extern void vp8_stop_encode(BOOL_CODER *bc);
+extern const unsigned int vp8_prob_cost[256];
+
+#endif
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
new file mode 100644
index 000000000..5207e39c4
--- /dev/null
+++ b/vp8/encoder/dct.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <math.h>
+
+
+static const short dct_matrix2[4][4] =
+{
+ { 23170, 30274, 23170, 12540 },
+ { 23170, 12540, -23170, -30274 },
+ { 23170, -12540, -23170, 30274 },
+ { 23170, -30274, 23170, -12540 }
+};
+
+static const short dct_matrix1[4][4] =
+{
+ { 23170, 23170, 23170, 23170 },
+ { 30274, 12540, -12540, -30274 },
+ { 23170, -23170, -23170, 23170 },
+ { 12540, -30274, 30274, -12540 }
+};
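+// These appear to be the 4-point DCT-II basis vectors in signed Q15 form:
+// 23170 ~= cos(pi/4) * 2^15, 30274 ~= cos(pi/8) * 2^15 and 12540 ~= cos(3*pi/8) * 2^15.
+// dct_matrix1 holds the basis as rows and dct_matrix2 is its transpose, so the
+// two passes below compute M * X * M' with intermediate rounding.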
+
+
+#define _1STSTAGESHIFT 14
+#define _1STSTAGEROUNDING (1<<( _1STSTAGESHIFT-1))
+#define _2NDSTAGESHIFT 16
+#define _2NDSTAGEROUNDING (1<<( _2NDSTAGESHIFT-1))
+
+// using matrix multiply
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+{
+ int i, j, k;
+ short temp[4][4];
+ int sumtemp;
+ pitch >>= 1;
+
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ sumtemp = 0;
+
+ for (k = 0; k < 4; k++)
+ {
+ sumtemp += input[i*pitch+k] * dct_matrix2[k][j];
+
+ }
+
+ temp[i][j] = (short)((sumtemp + _1STSTAGEROUNDING) >> _1STSTAGESHIFT);
+ }
+ }
+
+
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ sumtemp = 0;
+
+ for (k = 0; k < 4; k++)
+ {
+ sumtemp += dct_matrix1[i][k] * temp[k][j];
+ }
+
+ output[i*4+j] = (short)((sumtemp + _2NDSTAGEROUNDING) >> _2NDSTAGESHIFT);
+ }
+ }
+
+}
+
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_c(input, output, pitch);
+ vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
+}
+
+
+static const signed short x_c1 = 60547;
+static const signed short x_c2 = 46341;
+static const signed short x_c3 = 25080;
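+// These look like Q16 trigonometric constants: x_c2 / 2^16 ~= cos(pi/4) and
+// x_c3 / 2^16 ~= sin(pi/8).  x_c1 (60547) does not fit in a signed short and,
+// on the usual two's-complement targets, wraps to -4989 so that
+// ((v * x_c1) >> 16) + v ~= v * cos(pi/8).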
+
+void vp8_fast_fdct4x4_c(short *input, short *output, int pitch)
+{
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+
+ short *op = output;
+ int temp1, temp2;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = (ip[0] + ip[3]) * 2;
+ b1 = (ip[1] + ip[2]) * 2;
+ c1 = (ip[1] - ip[2]) * 2;
+ d1 = (ip[0] - ip[3]) * 2;
+
+ temp1 = a1 + b1;
+ temp2 = a1 - b1;
+
+ op[0] = ((temp1 * x_c2) >> 16) + temp1;
+ op[2] = ((temp2 * x_c2) >> 16) + temp2;
+
+ temp1 = (c1 * x_c3) >> 16;
+ temp2 = ((d1 * x_c1) >> 16) + d1;
+
+ op[1] = temp1 + temp2;
+
+ temp1 = (d1 * x_c3) >> 16;
+ temp2 = ((c1 * x_c1) >> 16) + c1;
+
+ op[3] = temp1 - temp2;
+
+ ip += pitch / 2;
+ op += 4;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+
+ temp1 = a1 + b1;
+ temp2 = a1 - b1;
+
+ a2 = ((temp1 * x_c2) >> 16) + temp1;
+ c2 = ((temp2 * x_c2) >> 16) + temp2;
+
+ temp1 = (c1 * x_c3) >> 16;
+ temp2 = ((d1 * x_c1) >> 16) + d1;
+
+ b2 = temp1 + temp2;
+
+ temp1 = (d1 * x_c3) >> 16;
+ temp2 = ((c1 * x_c1) >> 16) + c1;
+
+ d2 = temp1 - temp2;
+
+
+ op[0] = (a2 + 1) >> 1;
+ op[4] = (b2 + 1) >> 1;
+ op[8] = (c2 + 1) >> 1;
+ op[12] = (d2 + 1) >> 1;
+
+ ip++;
+ op++;
+ }
+}
+
+void vp8_fast_fdct8x4_c(short *input, short *output, int pitch)
+{
+ vp8_fast_fdct4x4_c(input, output, pitch);
+ vp8_fast_fdct4x4_c(input + 4, output + 16, pitch);
+}
+
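+// This 4x4 Walsh-Hadamard transform is used for the second order ("Y2") block
+// built from the 16 luma DC coefficients.  The "+ (x > 0)" adjustments before
+// the final >> 1 bias positive results so they round up rather than truncate.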
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
+{
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[3];
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3];
+
+ op[0] = a1 + b1;
+ op[1] = c1 + d1;
+ op[2] = a1 - b1;
+ op[3] = d1 - c1;
+ ip += pitch / 2;
+ op += 4;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ a2 = a1 + b1;
+ b2 = c1 + d1;
+ c2 = a1 - b1;
+ d2 = d1 - c1;
+
+ a2 += (a2 > 0);
+ b2 += (b2 > 0);
+ c2 += (c2 > 0);
+ d2 += (d2 > 0);
+
+ op[0] = (a2) >> 1;
+ op[4] = (b2) >> 1;
+ op[8] = (c2) >> 1;
+ op[12] = (d2) >> 1;
+
+ ip++;
+ op++;
+ }
+}
diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h
new file mode 100644
index 000000000..fb307cfb3
--- /dev/null
+++ b/vp8/encoder/dct.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_DCT_H
+#define __INC_DCT_H
+
+#define prototype_fdct(sym) void (sym)(short *input, short *output, int pitch)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/dct_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/dct_arm.h"
+#endif
+
+#ifndef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_c
+#endif
+extern prototype_fdct(vp8_fdct_short4x4);
+
+#ifndef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_c
+#endif
+extern prototype_fdct(vp8_fdct_short8x4);
+
+#ifndef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_c
+#endif
+extern prototype_fdct(vp8_fdct_fast4x4);
+
+#ifndef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_c
+#endif
+extern prototype_fdct(vp8_fdct_fast8x4);
+
+#ifndef vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_c
+#endif
+extern prototype_fdct(vp8_fdct_walsh_short4x4);
+
+typedef prototype_fdct(*vp8_fdct_fn_t);
+typedef struct
+{
+ vp8_fdct_fn_t short4x4;
+ vp8_fdct_fn_t short8x4;
+ vp8_fdct_fn_t fast4x4;
+ vp8_fdct_fn_t fast8x4;
+ vp8_fdct_fn_t walsh_short4x4;
+} vp8_fdct_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define FDCT_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define FDCT_INVOKE(ctx,fn) vp8_fdct_##fn
+#endif
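+// Illustrative call site (the rtcd member name here is only an assumption):
+//
+//     FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4)(be->src_diff, be->coeff, 32);
+//
+// With runtime CPU detection this reads the function pointer from the vtable;
+// otherwise it resolves at compile time to vp8_fdct_fast4x4.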
+
+#endif
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
new file mode 100644
index 000000000..a4e377220
--- /dev/null
+++ b/vp8/encoder/encodeframe.c
@@ -0,0 +1,1223 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "encodemb.h"
+#include "encodemv.h"
+#include "common.h"
+#include "onyx_int.h"
+#include "extend.h"
+#include "entropymode.h"
+#include "quant_common.h"
+#include "segmentation_common.h"
+#include "setupintrarecon.h"
+#include "encodeintra.h"
+#include "reconinter.h"
+#include "rdopt.h"
+#include "pickinter.h"
+#include "findnearmv.h"
+#include "reconintra.h"
+#include <stdio.h>
+#include <limits.h>
+#include "subpixel.h"
+#include "vpx_ports/vpx_timer.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD(x) &cpi->common.rtcd.x
+#define IF_RTCD(x) (x)
+#else
+#define RTCD(x) NULL
+#define IF_RTCD(x) NULL
+#endif
+extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
+
+extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
+extern void vp8_auto_select_speed(VP8_COMP *cpi);
+extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ MB_ROW_COMP *mbr_ei,
+ int mb_row,
+ int count);
+void vp8_build_block_offsets(MACROBLOCK *x);
+void vp8_setup_block_ptrs(MACROBLOCK *x);
+int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);
+int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+
+#ifdef MODE_STATS
+unsigned int inter_y_modes[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int inter_uv_modes[4] = {0, 0, 0, 0};
+unsigned int inter_b_modes[15] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int y_modes[5] = {0, 0, 0, 0, 0};
+unsigned int uv_modes[4] = {0, 0, 0, 0};
+unsigned int b_modes[14] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+
+// The first four entries are dummy values
+static const int qrounding_factors[129] =
+{
+ 56, 56, 56, 56, 56, 56, 56, 56,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48,
+};
+
+static const int qzbin_factors[129] =
+{
+ 64, 64, 64, 64, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80,
+};
+
+void vp8cx_init_quantizer(VP8_COMP *cpi)
+{
+ int r, c;
+ int i;
+ int quant_val;
+ int Q;
+
+ int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44};
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++)
+ {
+ // dc values
+ quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
+ cpi->Y1quant[Q][0][0] = (1 << 16) / quant_val;
+ cpi->Y1zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][0][0] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
+ cpi->Y2quant[Q][0][0] = (1 << 16) / quant_val;
+ cpi->Y2zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][0][0] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+ cpi->UVquant[Q][0][0] = (1 << 16) / quant_val;
+ cpi->UVzbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][0][0] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ // all the ac values
+ for (i = 1; i < 16; i++)
+ {
+ int rc = vp8_default_zig_zag1d[i];
+ r = (rc >> 2);
+ c = (rc & 3);
+
+ quant_val = vp8_ac_yquant(Q);
+ cpi->Y1quant[Q][r][c] = (1 << 16) / quant_val;
+ cpi->Y1zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][r][c] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+
+ quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+ cpi->Y2quant[Q][r][c] = (1 << 16) / quant_val;
+ cpi->Y2zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][r][c] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+
+ quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+ cpi->UVquant[Q][r][c] = (1 << 16) / quant_val;
+ cpi->UVzbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][r][c] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+ }
+ }
+}
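+// Rough meaning of the tables filled in above: *quant holds (1 << 16) / step,
+// so quantization becomes a multiply followed by >> 16; *zbin is the dead-zone
+// threshold (about step * qzbin_factor / 128); *round is the rounding offset
+// (about step * qrounding_factor / 128); and zrun_zbin_boost appears to widen
+// the dead zone as the run of zero coefficients grows.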
+
+void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
+{
+ int i;
+ int QIndex;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mbmi;
+ int zbin_extra;
+
+ // Select the baseline MB Q index.
+ if (xd->segmentation_enabled)
+ {
+ // Abs Value
+ if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+ // Delta Value
+ else
+ {
+ QIndex = cpi->common.base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+ QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; // Clamp to valid range
+ }
+ }
+ else
+ QIndex = cpi->common.base_qindex;
+
+ // Y
+ zbin_extra = (cpi->common.Y1dequant[QIndex][0][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
+
+ for (i = 0; i < 16; i++)
+ {
+ x->block[i].quant = cpi->Y1quant[QIndex];
+ x->block[i].zbin = cpi->Y1zbin[QIndex];
+ x->block[i].round = cpi->Y1round[QIndex];
+ x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
+ x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
+ x->block[i].zbin_extra = (short)zbin_extra;
+ }
+
+ // UV
+ zbin_extra = (cpi->common.UVdequant[QIndex][0][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
+
+ for (i = 16; i < 24; i++)
+ {
+ x->block[i].quant = cpi->UVquant[QIndex];
+ x->block[i].zbin = cpi->UVzbin[QIndex];
+ x->block[i].round = cpi->UVround[QIndex];
+ x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
+ x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
+ x->block[i].zbin_extra = (short)zbin_extra;
+ }
+
+ // Y2
+ zbin_extra = (cpi->common.Y2dequant[QIndex][0][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
+ x->block[24].quant = cpi->Y2quant[QIndex];
+ x->block[24].zbin = cpi->Y2zbin[QIndex];
+ x->block[24].round = cpi->Y2round[QIndex];
+ x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
+ x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
+ x->block[24].zbin_extra = (short)zbin_extra;
+}
+
+void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
+{
+ // vp8cx_init_quantizer() is first called in vp8_create_compressor(), so it only needs to be
+ // called again here when any of the per-frame delta_q values are non-zero.
+ if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q)
+ {
+ vp8cx_init_quantizer(cpi);
+ }
+
+ // MB level quantizer setup
+ vp8cx_mb_init_quantizer(cpi, &cpi->mb);
+}
+
+
+
+static
+void encode_mb_row(VP8_COMP *cpi,
+ VP8_COMMON *cm,
+ int mb_row,
+ MACROBLOCK *x,
+ MACROBLOCKD *xd,
+ TOKENEXTRA **tp,
+ int *segment_counts,
+ int *totalrate)
+{
+ int i;
+ int recon_yoffset, recon_uvoffset;
+ int mb_col;
+ int recon_y_stride = cm->last_frame.y_stride;
+ int recon_uv_stride = cm->last_frame.uv_stride;
+ int seg_map_index = (mb_row * cpi->common.mb_cols);
+
+
+ // reset above block coeffs
+ xd->above_context[Y1CONTEXT] = cm->above_context[Y1CONTEXT];
+ xd->above_context[UCONTEXT ] = cm->above_context[UCONTEXT ];
+ xd->above_context[VCONTEXT ] = cm->above_context[VCONTEXT ];
+ xd->above_context[Y2CONTEXT] = cm->above_context[Y2CONTEXT];
+
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+ cpi->tplist[mb_row].start = *tp;
+ //printf("Main mb_row = %d\n", mb_row);
+
+ // for each macroblock col in image
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ // Distance of Mb to the various image edges.
+ // These are specified in 1/8th-pel units as they are always compared to values that are in 1/8th-pel units
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+ // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
+ x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
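+ // These limits keep every candidate 16x16 predictor inside the extended
+ // (UMV) border of the reference frame, so motion compensation never reads
+ // outside the padded frame buffers.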
+
+ xd->dst.y_buffer = cm->new_frame.y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->new_frame.u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->new_frame.v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+
+ // If segmentation is enabled, apply any MB level adjustment to the quantizer
+ if (xd->segmentation_enabled)
+ {
+ // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
+ if (cpi->segmentation_map[seg_map_index+mb_col] <= 3)
+ xd->mbmi.segment_id = cpi->segmentation_map[seg_map_index+mb_col];
+ else
+ xd->mbmi.segment_id = 0;
+
+ vp8cx_mb_init_quantizer(cpi, x);
+ }
+ else
+ xd->mbmi.segment_id = 0; // Set to Segment 0 by default
+
+ x->active_ptr = cpi->active_map + seg_map_index + mb_col;
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp);
+#ifdef MODE_STATS
+ y_modes[xd->mbmi.mode] ++;
+#endif
+ }
+ else
+ {
+ *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset);
+
+#ifdef MODE_STATS
+ inter_y_modes[xd->mbmi.mode] ++;
+
+ if (xd->mbmi.mode == SPLITMV)
+ {
+ int b;
+
+ for (b = 0; b < xd->mbmi.partition_count; b++)
+ {
+ inter_b_modes[xd->mbmi.partition_bmi[b].mode] ++;
+ }
+ }
+
+#endif
+
+ // Count of last ref frame 0,0 usage
+ if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame == LAST_FRAME))
+ cpi->inter_zz_count ++;
+
+ // Special case code for cyclic refresh
+ // If cyclic update is enabled then copy xd->mbmi.segment_id (which may have been updated based on mode
+ // during vp8cx_encode_inter_macroblock()) back into the global segmentation map
+ if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
+ {
+ cpi->segmentation_map[seg_map_index+mb_col] = xd->mbmi.segment_id;
+
+ // If the block has been refreshed, mark it as clean (the magnitude of the negative value influences how long it will be before we consider another refresh).
+ // Else if it was coded (last frame 0,0) and has not already been refreshed, mark it as a candidate for cleanup next time (marked 0);
+ // else mark it as dirty (1).
+ if (xd->mbmi.segment_id)
+ cpi->cyclic_refresh_map[seg_map_index+mb_col] = -1;
+ else if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame == LAST_FRAME))
+ {
+ if (cpi->cyclic_refresh_map[seg_map_index+mb_col] == 1)
+ cpi->cyclic_refresh_map[seg_map_index+mb_col] = 0;
+ }
+ else
+ cpi->cyclic_refresh_map[seg_map_index+mb_col] = 1;
+
+ }
+ }
+
+ cpi->tplist[mb_row].stop = *tp;
+
+ xd->gf_active_ptr++; // Increment pointer into gf usage flags structure for next mb
+
+ // store macroblock mode info into context array
+ vpx_memcpy(&xd->mode_info_context->mbmi, &xd->mbmi, sizeof(xd->mbmi));
+
+ for (i = 0; i < 16; i++)
+ vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi));
+
+ // adjust to the next column of macroblocks
+ x->src.y_buffer += 16;
+ x->src.u_buffer += 8;
+ x->src.v_buffer += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ // Keep track of segment usage
+ segment_counts[xd->mbmi.segment_id] ++;
+
+ // skip to next mb
+ xd->mode_info_context++;
+
+ xd->above_context[Y1CONTEXT] += 4;
+ xd->above_context[UCONTEXT ] += 2;
+ xd->above_context[VCONTEXT ] += 2;
+ xd->above_context[Y2CONTEXT] ++;
+ cpi->current_mb_col_main = mb_col;
+ }
+
+ //extend the recon for intra prediction
+ vp8_extend_mb_row(
+ &cm->new_frame,
+ xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8,
+ xd->dst.v_buffer + 8);
+
+ // this is to account for the border
+ xd->mode_info_context++;
+}
+
+
+
+
+
+void vp8_encode_frame(VP8_COMP *cpi)
+{
+ int mb_row;
+ MACROBLOCK *const x = & cpi->mb;
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+
+ int i;
+ TOKENEXTRA *tp = cpi->tok;
+ int segment_counts[MAX_MB_SEGMENTS];
+ int totalrate;
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ if (cm->mcomp_filter_type == SIXTAP)
+ {
+ xd->subpixel_predict = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap4x4);
+ xd->subpixel_predict8x4 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap8x4);
+ xd->subpixel_predict8x8 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap8x8);
+ xd->subpixel_predict16x16 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap16x16);
+ }
+ else
+ {
+ xd->subpixel_predict = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear4x4);
+ xd->subpixel_predict8x4 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear8x4);
+ xd->subpixel_predict8x8 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear8x8);
+ xd->subpixel_predict16x16 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear16x16);
+ }
+ }
+
+ //else // Key Frame
+ //{
+ // For key frames make sure the intra ref frame probability value
+ // is set to "all intra"
+ //cpi->prob_intra_coded = 255;
+ //}
+
+
+ xd->gf_active_ptr = (signed char *)cm->gf_active_flags; // Point to base of GF active flags data structure
+
+ x->vector_range = 32;
+
+ // Count of MBs using the alternate Q if any
+ cpi->alt_qcount = 0;
+
+ // Reset frame count of inter 0,0 motion vector usage.
+ cpi->inter_zz_count = 0;
+
+ vpx_memset(segment_counts, 0, sizeof(segment_counts));
+
+ cpi->prediction_error = 0;
+ cpi->intra_error = 0;
+ cpi->skip_true_count = 0;
+ cpi->skip_false_count = 0;
+
+#if 0
+ // Experimental code
+ cpi->frame_distortion = 0;
+ cpi->last_mb_distortion = 0;
+#endif
+
+ totalrate = 0;
+
+ xd->mode_info = cm->mi - 1;
+
+ xd->mode_info_context = cm->mi;
+ xd->mode_info_stride = cm->mode_info_stride;
+
+ xd->frame_type = cm->frame_type;
+
+ xd->frames_since_golden = cm->frames_since_golden;
+ xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
+ vp8_zero(cpi->MVcount);
+ // vp8_zero( Contexts)
+ vp8_zero(cpi->coef_counts);
+
+ // reset intra mode contexts
+ if (cm->frame_type == KEY_FRAME)
+ vp8_init_mbmode_probs(cm);
+
+
+ vp8cx_frame_init_quantizer(cpi);
+
+ if (cpi->compressor_speed == 2)
+ {
+ if (cpi->oxcf.cpu_used < 0)
+ cpi->Speed = -(cpi->oxcf.cpu_used);
+ else
+ vp8_auto_select_speed(cpi);
+ }
+
+ vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+ //vp8_initialize_rd_consts( cpi, vp8_dc_quant(cpi->avg_frame_qindex, cm->y1dc_delta_q) );
+ vp8cx_initialize_me_consts(cpi, cm->base_qindex);
+ //vp8cx_initialize_me_consts( cpi, cpi->avg_frame_qindex);
+
+ // Copy data over into macro block data structures.
+
+ x->src = * cpi->Source;
+ xd->pre = cm->last_frame;
+ xd->dst = cm->new_frame;
+
+ // set up the new frame for intra coded blocks
+
+ vp8_setup_intra_recon(&cm->new_frame);
+
+ vp8_build_block_offsets(x);
+
+ vp8_setup_block_dptrs(&x->e_mbd);
+
+ vp8_setup_block_ptrs(x);
+
+ x->rddiv = cpi->RDDIV;
+ x->rdmult = cpi->RDMULT;
+
+#if 0
+ // Experimental rd code
+ // 2 Pass - Possibly set Rdmult based on last frame distortion + this frame target bits or other metrics
+ // such as cpi->rate_correction_factor that indicate relative complexity.
+ /*if ( cpi->pass == 2 && (cpi->last_frame_distortion > 0) && (cpi->target_bits_per_mb > 0) )
+ {
+ //x->rdmult = ((cpi->last_frame_distortion * 256)/cpi->common.MBs)/ cpi->target_bits_per_mb;
+ x->rdmult = (int)(cpi->RDMULT * cpi->rate_correction_factor);
+ }
+ else
+ x->rdmult = cpi->RDMULT; */
+ //x->rdmult = (int)(cpi->RDMULT * pow( (cpi->rate_correction_factor * 2.0), 0.75 ));
+#endif
+
+ xd->mbmi.mode = DC_PRED;
+ xd->mbmi.uv_mode = DC_PRED;
+
+ xd->left_context = cm->left_context;
+
+ vp8_zero(cpi->count_mb_ref_frame_usage)
+ vp8_zero(cpi->ymode_count)
+ vp8_zero(cpi->uv_mode_count)
+
+ x->mvc = cm->fc.mvc;
+
+ // vp8_zero( entropy_stats)
+ {
+ ENTROPY_CONTEXT **p = cm->above_context;
+ const size_t L = cm->mb_cols;
+
+ vp8_zero_array(p [Y1CONTEXT], L * 4)
+ vp8_zero_array(p [ UCONTEXT], L * 2)
+ vp8_zero_array(p [ VCONTEXT], L * 2)
+ vp8_zero_array(p [Y2CONTEXT], L)
+ }
+
+
+ {
+ struct vpx_usec_timer emr_timer;
+ vpx_usec_timer_start(&emr_timer);
+
+ if (!cpi->b_multi_threaded)
+ {
+ // for each macroblock row in image
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+
+ vp8_zero(cm->left_context)
+
+ encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+ // adjust to the next row of mbs
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ }
+
+ cpi->tok_count = tp - cpi->tok;
+
+ }
+ else
+ {
+#if CONFIG_MULTITHREAD
+ vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1, cpi->encoding_thread_count);
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
+ {
+ int i;
+ cpi->current_mb_col_main = -1;
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ if ((mb_row + i + 1) >= cm->mb_rows)
+ break;
+
+ cpi->mb_row_ei[i].mb_row = mb_row + i + 1;
+ cpi->mb_row_ei[i].tp = cpi->tok + (mb_row + i + 1) * (cm->mb_cols * 16 * 24);
+ cpi->mb_row_ei[i].current_mb_col = -1;
+ //SetEvent(cpi->h_event_mbrencoding[i]);
+ sem_post(&cpi->h_event_mbrencoding[i]);
+ }
+
+ vp8_zero(cm->left_context)
+
+ tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+
+ encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+ // adjust to the next row of mbs
+ x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+ xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+
+ if (mb_row < cm->mb_rows - 1)
+ //WaitForSingleObject(cpi->h_event_main, INFINITE);
+ sem_wait(&cpi->h_event_main);
+ }
+
+ /*
+ for( ;mb_row<cm->mb_rows; mb_row ++)
+ {
+ vp8_zero( cm->left_context)
+
+ tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+
+ encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+ // adjust to the next row of mbs
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+
+ }
+ */
+ cpi->tok_count = 0;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+ {
+ cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
+ }
+
+ if (xd->segmentation_enabled)
+ {
+
+ int i, j;
+
+ if (xd->segmentation_enabled)
+ {
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ for (j = 0; j < 4; j++)
+ segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
+ }
+ }
+
+ }
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ totalrate += cpi->mb_row_ei[i].totalrate;
+ }
+
+#endif
+
+ }
+
+ vpx_usec_timer_mark(&emr_timer);
+ cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
+
+ }
+
+
+ // Work out the segment probabilities if segmentation is enabled
+ if (xd->segmentation_enabled)
+ {
+ int tot_count;
+ int i;
+
+ // Set to defaults
+ vpx_memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
+
+ tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
+
+ if (tot_count)
+ {
+ xd->mb_segment_tree_probs[0] = ((segment_counts[0] + segment_counts[1]) * 255) / tot_count;
+
+ tot_count = segment_counts[0] + segment_counts[1];
+
+ if (tot_count > 0)
+ {
+ xd->mb_segment_tree_probs[1] = (segment_counts[0] * 255) / tot_count;
+ }
+
+ tot_count = segment_counts[2] + segment_counts[3];
+
+ if (tot_count > 0)
+ xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) / tot_count;
+
+ // Zero probabilities not allowed
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i ++)
+ {
+ if (xd->mb_segment_tree_probs[i] == 0)
+ xd->mb_segment_tree_probs[i] = 1;
+ }
+ }
+ }
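+ // The three probabilities above form a two-level binary tree for the segment
+ // id: probs[0] ~ P(segment 0 or 1), probs[1] ~ P(0 given {0,1}) and
+ // probs[2] ~ P(2 given {2,3}), each scaled into the range 1..255.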
+
+ // 256 rate units to the bit
+ cpi->projected_frame_size = totalrate >> 8; // projected_frame_size in units of BYTES
+
+ // Make a note of the percentage of MBs coded intra.
+ if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->this_frame_percent_intra = 100;
+ }
+ else
+ {
+ int tot_modes;
+
+ tot_modes = cpi->count_mb_ref_frame_usage[INTRA_FRAME]
+ + cpi->count_mb_ref_frame_usage[LAST_FRAME]
+ + cpi->count_mb_ref_frame_usage[GOLDEN_FRAME]
+ + cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+
+ if (tot_modes)
+ cpi->this_frame_percent_intra = cpi->count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes;
+
+ }
+
+#if 0
+ {
+ int cnt = 0;
+ int flag[2] = {0, 0};
+
+ for (cnt = 0; cnt < MVPcount; cnt++)
+ {
+ if (cm->fc.pre_mvc[0][cnt] != cm->fc.mvc[0][cnt])
+ {
+ flag[0] = 1;
+ vpx_memcpy(cm->fc.pre_mvc[0], cm->fc.mvc[0], MVPcount);
+ break;
+ }
+ }
+
+ for (cnt = 0; cnt < MVPcount; cnt++)
+ {
+ if (cm->fc.pre_mvc[1][cnt] != cm->fc.mvc[1][cnt])
+ {
+ flag[1] = 1;
+ vpx_memcpy(cm->fc.pre_mvc[1], cm->fc.mvc[1], MVPcount);
+ break;
+ }
+ }
+
+ if (flag[0] || flag[1])
+ vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+ }
+#endif
+
+ // Adjust the projected reference frame usage probability numbers to reflect
+ // what we have just seen. This may be useful when we make multiple iterations
+ // of the recode loop rather than continuing to use values from the previous frame.
+ if ((cm->frame_type != KEY_FRAME) && !cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)
+ {
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+ if ((rf_intra + rf_inter) > 0)
+ {
+ cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter);
+
+ if (cpi->prob_intra_coded < 1)
+ cpi->prob_intra_coded = 1;
+
+ if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active)
+ {
+ cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+ if (cpi->prob_last_coded < 1)
+ cpi->prob_last_coded = 1;
+
+ cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+ if (cpi->prob_gf_coded < 1)
+ cpi->prob_gf_coded = 1;
+ }
+ }
+ }
+
+#if 0
+ // Keep record of the total distortion this time around for future use
+ cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+
+}
+void vp8_setup_block_ptrs(MACROBLOCK *x)
+{
+ int r, c;
+ int i;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ x->block[r*4+c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
+ }
+ }
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[16 + r*2+c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
+ }
+ }
+
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[20 + r*2+c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
+ }
+ }
+
+ x->block[24].src_diff = x->src_diff + 384;
+
+
+ for (i = 0; i < 25; i++)
+ {
+ x->block[i].coeff = x->coeff + i * 16;
+ }
+}
+
+void vp8_build_block_offsets(MACROBLOCK *x)
+{
+ int block = 0;
+ int br, bc;
+
+ vp8_build_block_doffsets(&x->e_mbd);
+
+ // y blocks
+ for (br = 0; br < 4; br++)
+ {
+ for (bc = 0; bc < 4; bc++)
+ {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->src.y_buffer;
+ this_block->src_stride = x->src.y_stride;
+ this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+ ++block;
+ }
+ }
+
+ // u blocks
+ for (br = 0; br < 2; br++)
+ {
+ for (bc = 0; bc < 2; bc++)
+ {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->src.u_buffer;
+ this_block->src_stride = x->src.uv_stride;
+ this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+ ++block;
+ }
+ }
+
+ // v blocks
+ for (br = 0; br < 2; br++)
+ {
+ for (bc = 0; bc < 2; bc++)
+ {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->src.v_buffer;
+ this_block->src_stride = x->src.uv_stride;
+ this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+ ++block;
+ }
+ }
+}
+
+static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
+{
+ const MACROBLOCKD *xd = & x->e_mbd;
+ const MB_PREDICTION_MODE m = xd->mbmi.mode;
+ const MB_PREDICTION_MODE uvm = xd->mbmi.uv_mode;
+
+#ifdef MODE_STATS
+ const int is_key = cpi->common.frame_type == KEY_FRAME;
+
+ ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
+
+ if (m == B_PRED)
+ {
+ unsigned int *const bct = is_key ? b_modes : inter_b_modes;
+
+ int b = 0;
+
+ do
+ {
+ ++ bct[xd->block[b].bmi.mode];
+ }
+ while (++b < 16);
+ }
+
+#endif
+
+ ++cpi->ymode_count[m];
+ ++cpi->uv_mode_count[uvm];
+
+}
+int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+{
+ int Error4x4, Error16x16, error_uv;
+ B_PREDICTION_MODE intra_bmodes[16];
+ int rate4x4, rate16x16, rateuv;
+ int dist4x4, dist16x16, distuv;
+ int rate = 0;
+ int rate4x4_tokenonly = 0;
+ int rate16x16_tokenonly = 0;
+ int rateuv_tokenonly = 0;
+ int i;
+
+ x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->sf.RD || cpi->compressor_speed != 2)
+ {
+ Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4);
+
+ //save the b modes for possible later use
+ for (i = 0; i < 16; i++)
+ intra_bmodes[i] = x->e_mbd.block[i].bmi.mode;
+
+ Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16);
+
+ error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+
+ x->e_mbd.mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+
+ vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+ rate += rateuv;
+
+ if (Error4x4 < Error16x16)
+ {
+ rate += rate4x4;
+ x->e_mbd.mbmi.mode = B_PRED;
+
+ // get back the intra block modes
+ for (i = 0; i < 16; i++)
+ x->e_mbd.block[i].bmi.mode = intra_bmodes[i];
+
+ vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+ cpi->prediction_error += Error4x4 ;
+#if 0
+ // Experimental RD code
+ cpi->frame_distortion += dist4x4;
+#endif
+ }
+ else
+ {
+ vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+ rate += rate16x16;
+
+#if 0
+ // Experimental RD code
+ cpi->prediction_error += Error16x16;
+ cpi->frame_distortion += dist16x16;
+#endif
+ }
+
+ sum_intra_stats(cpi, x);
+
+ vp8_tokenize_mb(cpi, &x->e_mbd, t);
+ }
+ else
+#endif
+ {
+
+ int rate2, distortion2;
+ MB_PREDICTION_MODE mode, best_mode = DC_PRED;
+ int this_rd;
+ Error16x16 = INT_MAX;
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode ++)
+ {
+ x->e_mbd.mbmi.mode = mode;
+ vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
+ rate2 = x->mbmode_cost[x->e_mbd.frame_type][mode];
+ this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (Error16x16 > this_rd)
+ {
+ Error16x16 = this_rd;
+ best_mode = mode;
+ }
+ }
+
+ vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &distortion2);
+
+ if (distortion2 == INT_MAX)
+ Error4x4 = INT_MAX;
+ else
+ Error4x4 = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+
+ x->e_mbd.mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+
+ if (Error4x4 < Error16x16)
+ {
+ x->e_mbd.mbmi.mode = B_PRED;
+ vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+ cpi->prediction_error += Error4x4;
+ }
+ else
+ {
+ x->e_mbd.mbmi.mode = best_mode;
+ vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+ cpi->prediction_error += Error16x16;
+ }
+
+ vp8_pick_intra_mbuv_mode(x);
+ vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+ sum_intra_stats(cpi, x);
+ vp8_tokenize_mb(cpi, &x->e_mbd, t);
+ }
+
+ return rate;
+}
+#ifdef SPEEDSTATS
+extern int cnt_pm;
+#endif
+
+extern void vp8_fix_contexts(VP8_COMP *cpi, MACROBLOCKD *x);
+
+int vp8cx_encode_inter_macroblock
+(
+ VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+ int recon_yoffset, int recon_uvoffset
+)
+{
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int inter_error;
+ int intra_error = 0;
+ int rate;
+ int distortion;
+
+ x->skip = 0;
+
+ if (xd->segmentation_enabled)
+ x->encode_breakout = cpi->segment_encode_breakout[xd->mbmi.segment_id];
+ else
+ x->encode_breakout = cpi->oxcf.encode_breakout;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->sf.RD)
+ {
+ inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
+ }
+ else
+#endif
+ inter_error = vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
+
+
+ cpi->prediction_error += inter_error;
+ cpi->intra_error += intra_error;
+
+#if 0
+ // Experimental RD code
+ cpi->frame_distortion += distortion;
+ cpi->last_mb_distortion = distortion;
+#endif
+
+ // MB level adjustment to quantizer setup
+ if (xd->segmentation_enabled || cpi->zbin_mode_boost_enabled)
+ {
+ // If cyclic update enabled
+ if (cpi->cyclic_refresh_mode_enabled)
+ {
+ // Clear segment_id back to 0 if not coded (last frame 0,0)
+ if ((xd->mbmi.segment_id == 1) &&
+ ((xd->mbmi.ref_frame != LAST_FRAME) || (xd->mbmi.mode != ZEROMV)))
+ {
+ xd->mbmi.segment_id = 0;
+ }
+ }
+
+ // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to suppress noise
+ if (cpi->zbin_mode_boost_enabled)
+ {
+ if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame != LAST_FRAME))
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = 0;
+ }
+
+ vp8cx_mb_init_quantizer(cpi, x);
+ }
+
+ cpi->count_mb_ref_frame_usage[xd->mbmi.ref_frame] ++;
+
+ if (xd->mbmi.ref_frame == INTRA_FRAME)
+ {
+ x->e_mbd.mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+
+ vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+
+ if (xd->mbmi.mode == B_PRED)
+ {
+ vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+ }
+ else
+ {
+ vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+ }
+
+ sum_intra_stats(cpi, x);
+ }
+ else
+ {
+ MV best_ref_mv;
+ MV nearest, nearby;
+ int mdcounts[4];
+
+ vp8_find_near_mvs(xd, xd->mode_info_context,
+ &nearest, &nearby, &best_ref_mv, mdcounts, xd->mbmi.ref_frame, cpi->common.ref_frame_sign_bias);
+
+ vp8_build_uvmvs(xd, cpi->common.full_pixel);
+
+ // store motion vectors in our motion vector list
+ if (xd->mbmi.ref_frame == LAST_FRAME)
+ {
+ // Set up pointers for this macro block into the previous frame recon buffer
+ xd->pre.y_buffer = cpi->common.last_frame.y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cpi->common.last_frame.u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cpi->common.last_frame.v_buffer + recon_uvoffset;
+ }
+ else if (xd->mbmi.ref_frame == GOLDEN_FRAME)
+ {
+ // Set up pointers for this macro block into the golden frame recon buffer
+ xd->pre.y_buffer = cpi->common.golden_frame.y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cpi->common.golden_frame.u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cpi->common.golden_frame.v_buffer + recon_uvoffset;
+ }
+ else
+ {
+ // Set up pointers for this macro block into the alternate reference frame recon buffer
+ xd->pre.y_buffer = cpi->common.alt_ref_frame.y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cpi->common.alt_ref_frame.u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cpi->common.alt_ref_frame.v_buffer + recon_uvoffset;
+ }
+
+ if (xd->mbmi.mode == SPLITMV)
+ {
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ if (xd->block[i].bmi.mode == NEW4X4)
+ {
+ cpi->MVcount[0][mv_max+((xd->block[i].bmi.mv.as_mv.row - best_ref_mv.row) >> 1)]++;
+ cpi->MVcount[1][mv_max+((xd->block[i].bmi.mv.as_mv.col - best_ref_mv.col) >> 1)]++;
+ }
+ }
+ }
+ else if (xd->mbmi.mode == NEWMV)
+ {
+ cpi->MVcount[0][mv_max+((xd->block[0].bmi.mv.as_mv.row - best_ref_mv.row) >> 1)]++;
+ cpi->MVcount[1][mv_max+((xd->block[0].bmi.mv.as_mv.col - best_ref_mv.col) >> 1)]++;
+ }
+
+ if (!x->skip && !x->e_mbd.mbmi.force_no_skip)
+ {
+ vp8_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
+
+ // Clear mb_skip_coeff if mb_no_coeff_skip is not set
+ if (!cpi->common.mb_no_coeff_skip)
+ xd->mbmi.mb_skip_coeff = 0;
+
+ }
+ else
+ vp8_stuff_inter16x16(x);
+ }
+
+ if (!x->skip)
+ vp8_tokenize_mb(cpi, xd, t);
+ else
+ {
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV)
+ xd->mbmi.dc_diff = 0;
+ else
+ xd->mbmi.dc_diff = 1;
+
+ xd->mbmi.mb_skip_coeff = 1;
+ cpi->skip_true_count ++;
+ vp8_fix_contexts(cpi, xd);
+ }
+ else
+ {
+ vp8_stuff_mb(cpi, xd, t);
+ xd->mbmi.mb_skip_coeff = 0;
+ cpi->skip_false_count ++;
+ }
+ }
+
+ return rate;
+}
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
new file mode 100644
index 000000000..403d0204a
--- /dev/null
+++ b/vp8/encoder/encodeintra.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "quantize.h"
+#include "reconintra.h"
+#include "reconintra4x4.h"
+#include "encodemb.h"
+#include "invtrans.h"
+#include "recon.h"
+#include "dct.h"
+#include "g_common.h"
+#include "encodeintra.h"
+
+#define intra4x4ibias_rate 128
+#define intra4x4pbias_rate 256
+
+
+void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode)
+{
+ if (i < 12)
+ {
+ abmode[i+4] = best_mode;
+ }
+
+ if ((i & 3) != 3)
+ {
+ lbmode[i+1] = best_mode;
+ }
+
+}
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
+{
+ vp8_predict_intra4x4(b, best_mode, b->predictor);
+
+ ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
+
+ x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
+
+ x->quantize_b(be, b);
+
+ x->e_mbd.mbmi.mb_skip_coeff &= (!b->eob);
+
+ vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32);
+
+ RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+}
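+// vp8_encode_intra4x4block() therefore codes each 4x4 block as: predict ->
+// subtract -> forward DCT -> quantize -> inverse transform -> reconstruct, so
+// later blocks in the macroblock predict from reconstructed rather than source
+// pixels.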
+
+void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
+{
+ vp8_predict_intra4x4(b, best_mode, b->predictor);
+
+ ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
+
+ x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
+
+ x->quantize_brd(be, b);
+
+ x->e_mbd.mbmi.mb_skip_coeff &= (!b->eob);
+
+ IDCT_INVOKE(&rtcd->common->idct, idct16)(b->dqcoeff, b->diff, 32);
+
+ RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+}
+
+void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
+{
+ int i;
+
+ MACROBLOCKD *x = &mb->e_mbd;
+ vp8_intra_prediction_down_copy(x);
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCK *be = &mb->block[i];
+ BLOCKD *b = &x->block[i];
+
+ vp8_encode_intra4x4block(rtcd, mb, be, b, b->bmi.mode);
+ }
+
+ return;
+}
+
+void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+ int b;
+
+ vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+
+ ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+
+ vp8_transform_intra_mby(x);
+
+ vp8_quantize_mby(x);
+
+#if !(CONFIG_REALTIME_ONLY)
+#if 1
+
+ if (x->optimize && x->rddiv > 1)
+ vp8_optimize_mby(x, rtcd);
+
+#endif
+#endif
+
+ vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+ vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+
+ // make sure block modes are set the way we want them for context updates
+ for (b = 0; b < 16; b++)
+ {
+ BLOCKD *d = &x->e_mbd.block[b];
+
+ switch (x->e_mbd.mbmi.mode)
+ {
+
+ case DC_PRED:
+ d->bmi.mode = B_DC_PRED;
+ break;
+ case V_PRED:
+ d->bmi.mode = B_VE_PRED;
+ break;
+ case H_PRED:
+ d->bmi.mode = B_HE_PRED;
+ break;
+ case TM_PRED:
+ d->bmi.mode = B_TM_PRED;
+ break;
+ default:
+ d->bmi.mode = B_DC_PRED;
+ break;
+
+ }
+ }
+}
+
+void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+ int b;
+
+ vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+
+ ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+
+ vp8_transform_intra_mbyrd(x);
+
+ x->e_mbd.mbmi.mb_skip_coeff = 1;
+
+ vp8_quantize_mbyrd(x);
+
+
+ vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+ vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+
+ // make sure block modes are set the way we want them for context updates
+ for (b = 0; b < 16; b++)
+ {
+ BLOCKD *d = &x->e_mbd.block[b];
+
+ switch (x->e_mbd.mbmi.mode)
+ {
+
+ case DC_PRED:
+ d->bmi.mode = B_DC_PRED;
+ break;
+ case V_PRED:
+ d->bmi.mode = B_VE_PRED;
+ break;
+ case H_PRED:
+ d->bmi.mode = B_HE_PRED;
+ break;
+ case TM_PRED:
+ d->bmi.mode = B_TM_PRED;
+ break;
+ default:
+ d->bmi.mode = B_DC_PRED;
+ break;
+
+ }
+ }
+}
+
+void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+ vp8_build_intra_predictors_mbuv(&x->e_mbd);
+
+ ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+
+ vp8_transform_mbuv(x);
+
+ vp8_quantize_mbuv(x);
+
+#if !(CONFIG_REALTIME_ONLY)
+#if 1
+
+ if (x->optimize && x->rddiv > 1)
+ vp8_optimize_mbuv(x, rtcd);
+
+#endif
+#endif
+
+ vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+ vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+}
+
+void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+ vp8_build_intra_predictors_mbuv(&x->e_mbd);
+
+ ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+
+ vp8_transform_mbuvrd(x);
+
+ vp8_quantize_mbuvrd(x);
+
+
+
+ vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+ vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+}
diff --git a/vp8/encoder/encodeintra.h b/vp8/encoder/encodeintra.h
new file mode 100644
index 000000000..4a43ab275
--- /dev/null
+++ b/vp8/encoder/encodeintra.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef _ENCODEINTRA_H_
+#define _ENCODEINTRA_H_
+#include "onyx_int.h"
+
+void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
+void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
+void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb);
+void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
+void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode);
+void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
+void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
+void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
+
+#endif
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
new file mode 100644
index 000000000..d82513318
--- /dev/null
+++ b/vp8/encoder/encodemb.c
@@ -0,0 +1,1129 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "encodemb.h"
+#include "reconinter.h"
+#include "quantize.h"
+#include "invtrans.h"
+#include "recon.h"
+#include "reconintra.h"
+#include "dct.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *src_ptr = (*(be->base_src) + be->src);
+ short *diff_ptr = be->src_diff;
+ unsigned char *pred_ptr = bd->predictor;
+ int src_stride = be->src_stride;
+
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+ }
+
+ diff_ptr += pitch;
+ pred_ptr += pitch;
+ src_ptr += src_stride;
+ }
+}
+
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+{
+ short *udiff = diff + 256;
+ short *vdiff = diff + 320;
+ unsigned char *upred = pred + 256;
+ unsigned char *vpred = pred + 320;
+
+ int r, c;
+
+ for (r = 0; r < 8; r++)
+ {
+ for (c = 0; c < 8; c++)
+ {
+ udiff[c] = usrc[c] - upred[c];
+ }
+
+ udiff += 8;
+ upred += 8;
+ usrc += stride;
+ }
+
+ for (r = 0; r < 8; r++)
+ {
+ for (c = 0; c < 8; c++)
+ {
+ vdiff[c] = vsrc[c] - vpred[c];
+ }
+
+ vdiff += 8;
+ vpred += 8;
+ vsrc += stride;
+ }
+}
+
+void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
+{
+ int r, c;
+
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ diff[c] = src[c] - pred[c];
+ }
+
+ diff += 16;
+ pred += 16;
+ src += stride;
+ }
+}
+
+static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+ ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+ ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+}
+
+void vp8_build_dcblock(MACROBLOCK *x)
+{
+ short *src_diff_ptr = &x->src_diff[384];
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ src_diff_ptr[i] = x->coeff[i * 16];
+ }
+}
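+// vp8_build_dcblock() gathers the DC (first) coefficient of each of the 16 Y
+// blocks into src_diff[384..399], which block 24 then transforms with the 4x4
+// Walsh-Hadamard ("second order") transform.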
+
+void vp8_transform_mbuv(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i += 2)
+ {
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ }
+}
+
+void vp8_transform_mbuvrd(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i += 2)
+ {
+ x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ }
+}
+
+void vp8_transform_intra_mby(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ }
+
+ // build dc block from 16 y dc values
+ vp8_build_dcblock(x);
+
+ // do 2nd order transform on the dc block
+ x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+
+}
+
+void vp8_transform_intra_mbyrd(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ }
+
+ // build dc block from 16 y dc values
+ vp8_build_dcblock(x);
+
+ // do 2nd order transform on the dc block
+ x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+}
+
+void vp8_transform_mb(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ }
+
+ // build dc block from 16 y dc values
+ if (x->e_mbd.mbmi.mode != SPLITMV)
+ vp8_build_dcblock(x);
+
+ for (i = 16; i < 24; i += 2)
+ {
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ }
+
+ // do 2nd order transform on the dc block
+ if (x->e_mbd.mbmi.mode != SPLITMV)
+ x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+
+}
+
+void vp8_transform_mby(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ }
+
+ // build dc block from 16 y dc values
+ if (x->e_mbd.mbmi.mode != SPLITMV)
+ {
+ vp8_build_dcblock(x);
+ x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ }
+}
+
+void vp8_transform_mbrd(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ }
+
+ // build dc block from 16 y dc values
+ if (x->e_mbd.mbmi.mode != SPLITMV)
+ vp8_build_dcblock(x);
+
+ for (i = 16; i < 24; i += 2)
+ {
+ x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ }
+
+ // do 2nd order transform on the dc block
+ if (x->e_mbd.mbmi.mode != SPLITMV)
+ x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+}
+
+void vp8_stuff_inter16x16(MACROBLOCK *x)
+{
+ vp8_build_inter_predictors_mb_s(&x->e_mbd);
+ /*
+ // recon = copy from predictors to destination
+ {
+ BLOCKD *b = &x->e_mbd.block[0];
+ unsigned char *pred_ptr = b->predictor;
+ unsigned char *dst_ptr = *(b->base_dst) + b->dst;
+ int stride = b->dst_stride;
+
+ int i;
+ for(i=0;i<16;i++)
+ vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16);
+
+ b = &x->e_mbd.block[16];
+ pred_ptr = b->predictor;
+ dst_ptr = *(b->base_dst) + b->dst;
+ stride = b->dst_stride;
+
+ for(i=0;i<8;i++)
+ vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
+
+ b = &x->e_mbd.block[20];
+ pred_ptr = b->predictor;
+ dst_ptr = *(b->base_dst) + b->dst;
+ stride = b->dst_stride;
+
+ for(i=0;i<8;i++)
+ vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
+ }
+ */
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+extern const TOKENEXTRA vp8_dct_value_tokens[DCT_MAX_VALUE*2];
+extern const TOKENEXTRA *vp8_dct_value_tokens_ptr;
+extern int vp8_dct_value_cost[DCT_MAX_VALUE*2];
+extern int *vp8_dct_value_cost_ptr;
+
+static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+ int c = !type; /* start at coef 0, unless Y with Y2 */
+ int eob = b->eob;
+ int pt ; /* surrounding block/prev coef predictor */
+ int cost = 0;
+ short *qcoeff_ptr = b->qcoeff;
+
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+# define QC( I) ( qcoeff_ptr [vp8_default_zig_zag1d[I]] )
+
+ for (; c < eob; c++)
+ {
+ int v = QC(c);
+ int t = vp8_dct_value_tokens_ptr[v].Token;
+ cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t];
+ cost += vp8_dct_value_cost_ptr[v];
+ pt = vp8_prev_token_class[t];
+ }
+
+# undef QC
+
+ if (c < 16)
+ cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN];
+
+ return cost;
+}
+
+static int mbycost_coeffs(MACROBLOCK *mb)
+{
+ int cost = 0;
+ int b;
+ TEMP_CONTEXT t;
+ int type = 0;
+
+ MACROBLOCKD *x = &mb->e_mbd;
+
+ vp8_setup_temp_context(&t, x->above_context[Y1CONTEXT], x->left_context[Y1CONTEXT], 4);
+
+ if (x->mbmi.mode == SPLITMV)
+ type = 3;
+
+ for (b = 0; b < 16; b++)
+ cost += cost_coeffs(mb, x->block + b, type,
+ t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+
+ return cost;
+}
+
+#define RDFUNC(RM,DM,R,D,target_rd) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
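A minimal standalone sketch of how this Lagrangian cost behaves, using illustrative values for the rate multiplier and distortion multiplier (the real rdmult and rddiv are configured elsewhere in the encoder); note that the target_rd argument is unused by the macro as written.

#include <stdio.h>

/* Same shape as RDFUNC above: rate is scaled by RM with rounding (>> 8),
   distortion is scaled by DM, and the last argument is ignored. */
#define RDFUNC(RM, DM, R, D, target_rd) (((128 + (R) * (RM)) >> 8) + (DM) * (D))

int main(void)
{
    int rdmult = 300;   /* hypothetical rate multiplier */
    int rddiv  = 1;     /* hypothetical distortion multiplier */
    int rate   = 40;    /* rate term, in the encoder's cost units */
    int dist   = 120;   /* distortion term (squared-error style) */

    /* cost = round_down((128 + rate * rdmult) / 256) + rddiv * dist */
    printf("rd cost = %d\n", RDFUNC(rdmult, rddiv, rate, dist, 100));
    return 0;
}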
+
+void vp8_optimize_b(MACROBLOCK *x, int i, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, const VP8_ENCODER_RTCD *rtcd)
+{
+ BLOCK *b = &x->block[i];
+ BLOCKD *bd = &x->e_mbd.block[i];
+ short *dequant_ptr = &bd->dequant[0][0];
+ int nzpos[16] = {0};
+ short saved_qcoefs[16];
+ short saved_dqcoefs[16];
+ int baserate, baseerror, baserd;
+ int rate, error, thisrd;
+ int k;
+ int nzcoefcount = 0;
+ int nc, bestnc = 0;
+ int besteob;
+
+    // count potential coefficients to be optimized
+ for (k = !type; k < 16; k++)
+ {
+ int qcoef = abs(bd->qcoeff[k]);
+ int coef = abs(b->coeff[k]);
+ int dq = dequant_ptr[k];
+
+ if (qcoef && (qcoef * dq > coef) && (qcoef * dq < coef + dq))
+ {
+ nzpos[nzcoefcount] = k;
+ nzcoefcount++;
+ }
+ }
+
+ // if nothing here, do nothing for this block.
+ if (!nzcoefcount)
+ {
+ *a = *l = (bd->eob != !type);
+ return;
+ }
+
+ // save a copy of quantized coefficients
+ vpx_memcpy(saved_qcoefs, bd->qcoeff, 32);
+ vpx_memcpy(saved_dqcoefs, bd->dqcoeff, 32);
+
+ besteob = bd->eob;
+ baserate = cost_coeffs(x, bd, type, a, l);
+ baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2;
+ baserd = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100);
+
+ for (nc = 1; nc < (1 << nzcoefcount); nc++)
+ {
+ //reset coefficients
+ vpx_memcpy(bd->qcoeff, saved_qcoefs, 32);
+ vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32);
+
+ for (k = 0; k < nzcoefcount; k++)
+ {
+ int pos = nzpos[k];
+
+ if ((nc & (1 << k)))
+ {
+ int cur_qcoef = bd->qcoeff[pos];
+
+ if (cur_qcoef < 0)
+ {
+ bd->qcoeff[pos]++;
+ bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+ }
+ else
+ {
+ bd->qcoeff[pos]--;
+ bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+ }
+ }
+ }
+
+ {
+ int eob = -1;
+ int rc;
+ int m;
+
+ for (m = 0; m < 16; m++)
+ {
+ rc = vp8_default_zig_zag1d[m];
+
+ if (bd->qcoeff[rc])
+ eob = m;
+ }
+
+ bd->eob = eob + 1;
+ }
+
+ rate = cost_coeffs(x, bd, type, a, l);
+ error = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2;
+ thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100);
+
+ if (thisrd < baserd)
+ {
+ baserd = thisrd;
+ bestnc = nc;
+ besteob = bd->eob;
+ }
+ }
+
+ //reset coefficients
+ vpx_memcpy(bd->qcoeff, saved_qcoefs, 32);
+ vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32);
+
+ if (bestnc)
+ {
+ for (k = 0; k < nzcoefcount; k++)
+ {
+ int pos = nzpos[k];
+
+ if (bestnc & (1 << k))
+ {
+ int cur_qcoef = bd->qcoeff[pos];
+
+ if (cur_qcoef < 0)
+ {
+ bd->qcoeff[pos]++;
+ bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+ }
+ else
+ {
+ bd->qcoeff[pos]--;
+ bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+ }
+ }
+ }
+
+#if 0
+ {
+ int eob = -1;
+ int rc;
+ int m;
+
+ for (m = 0; m < 16; m++)
+ {
+ rc = vp8_default_zig_zag1d[m];
+
+ if (bd->qcoeff[rc])
+ eob = m;
+ }
+
+ bd->eob = eob + 1;
+ }
+#endif
+ }
+
+#if 1
+ bd->eob = besteob;
+#endif
+#if 0
+ {
+ int eob = -1;
+ int rc;
+ int m;
+
+ for (m = 0; m < 16; m++)
+ {
+ rc = vp8_default_zig_zag1d[m];
+
+ if (bd->qcoeff[rc])
+ eob = m;
+ }
+
+ bd->eob = eob + 1;
+ }
+
+#endif
+ *a = *l = (bd->eob != !type);
+ return;
+}
+
+void vp8_optimize_bplus(MACROBLOCK *x, int i, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, const VP8_ENCODER_RTCD *rtcd)
+{
+ BLOCK *b = &x->block[i];
+ BLOCKD *bd = &x->e_mbd.block[i];
+ short *dequant_ptr = &bd->dequant[0][0];
+ int nzpos[16] = {0};
+ short saved_qcoefs[16];
+ short saved_dqcoefs[16];
+ int baserate, baseerror, baserd;
+ int rate, error, thisrd;
+ int k;
+ int nzcoefcount = 0;
+ int nc, bestnc = 0;
+ int besteob;
+
+    // count potential coefficients to be optimized
+ for (k = !type; k < 16; k++)
+ {
+ int qcoef = abs(bd->qcoeff[k]);
+ int coef = abs(b->coeff[k]);
+ int dq = dequant_ptr[k];
+
+ if (qcoef && (qcoef * dq < coef) && (coef < (qcoef * dq + dq)))
+ {
+ nzpos[nzcoefcount] = k;
+ nzcoefcount++;
+ }
+ }
+
+ // if nothing here, do nothing for this block.
+ if (!nzcoefcount)
+ {
+        // do not update the context; we still need to do the other half.
+ //*a = *l = (bd->eob != !type);
+ return;
+ }
+
+ // save a copy of quantized coefficients
+ vpx_memcpy(saved_qcoefs, bd->qcoeff, 32);
+ vpx_memcpy(saved_dqcoefs, bd->dqcoeff, 32);
+
+ besteob = bd->eob;
+ baserate = cost_coeffs(x, bd, type, a, l);
+ baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2;
+ baserd = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100);
+
+ for (nc = 1; nc < (1 << nzcoefcount); nc++)
+ {
+ //reset coefficients
+ vpx_memcpy(bd->qcoeff, saved_qcoefs, 32);
+ vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32);
+
+ for (k = 0; k < nzcoefcount; k++)
+ {
+ int pos = nzpos[k];
+
+ if ((nc & (1 << k)))
+ {
+ int cur_qcoef = bd->qcoeff[pos];
+
+ if (cur_qcoef < 0)
+ {
+ bd->qcoeff[pos]--;
+ bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+ }
+ else
+ {
+ bd->qcoeff[pos]++;
+ bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+ }
+ }
+ }
+
+ {
+ int eob = -1;
+ int rc;
+ int m;
+
+ for (m = 0; m < 16; m++)
+ {
+ rc = vp8_default_zig_zag1d[m];
+
+ if (bd->qcoeff[rc])
+ eob = m;
+ }
+
+ bd->eob = eob + 1;
+ }
+
+ rate = cost_coeffs(x, bd, type, a, l);
+ error = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2;
+ thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100);
+
+ if (thisrd < baserd)
+ {
+ baserd = thisrd;
+ bestnc = nc;
+ besteob = bd->eob;
+ }
+ }
+
+ //reset coefficients
+ vpx_memcpy(bd->qcoeff, saved_qcoefs, 32);
+ vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32);
+
+ if (bestnc)
+ {
+ for (k = 0; k < nzcoefcount; k++)
+ {
+ int pos = nzpos[k];
+
+ if (bestnc & (1 << k))
+ {
+ int cur_qcoef = bd->qcoeff[pos];
+
+ if (cur_qcoef < 0)
+ {
+ bd->qcoeff[pos]++;
+ bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+ }
+ else
+ {
+ bd->qcoeff[pos]--;
+ bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+ }
+ }
+ }
+ }
+
+ bd->eob = besteob;
+    // do not update the context; we still need to do the other half.
+ //*a = *l = (bd->eob != !type);
+ return;
+}
+
+void vp8_optimize_y2b(MACROBLOCK *x, int i, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, const VP8_ENCODER_RTCD *rtcd)
+{
+
+ BLOCK *b = &x->block[i];
+ BLOCKD *bd = &x->e_mbd.block[i];
+ short *dequant_ptr = &bd->dequant[0][0];
+
+ int baserate, baseerror, baserd;
+ int rate, error, thisrd;
+ int k;
+
+ if (bd->eob == 0)
+ return;
+
+ baserate = cost_coeffs(x, bd, type, a, l);
+ baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 4;
+ baserd = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100);
+
+ for (k = 0; k < 16; k++)
+ {
+ int cur_qcoef = bd->qcoeff[k];
+
+ if (!cur_qcoef)
+ continue;
+
+ if (cur_qcoef < 0)
+ {
+ bd->qcoeff[k]++;
+ bd->dqcoeff[k] = bd->qcoeff[k] * dequant_ptr[k];
+ }
+ else
+ {
+ bd->qcoeff[k]--;
+ bd->dqcoeff[k] = bd->qcoeff[k] * dequant_ptr[k];
+ }
+
+ if (bd->qcoeff[k] == 0)
+ {
+ int eob = -1;
+ int rc;
+ int l;
+
+ for (l = 0; l < 16; l++)
+ {
+ rc = vp8_default_zig_zag1d[l];
+
+ if (bd->qcoeff[rc])
+ eob = l;
+ }
+
+ bd->eob = eob + 1;
+ }
+
+ rate = cost_coeffs(x, bd, type, a, l);
+ error = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 4;
+ thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100);
+
+ if (thisrd > baserd)
+ {
+ bd->qcoeff[k] = cur_qcoef;
+ bd->dqcoeff[k] = cur_qcoef * dequant_ptr[k];
+ }
+ else
+ {
+ baserd = thisrd;
+ }
+
+ }
+
+ {
+ int eob = -1;
+ int rc;
+
+ for (k = 0; k < 16; k++)
+ {
+ rc = vp8_default_zig_zag1d[k];
+
+ if (bd->qcoeff[rc])
+ eob = k;
+ }
+
+ bd->eob = eob + 1;
+ }
+
+ return;
+}
+
+
+void vp8_optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+ int cost = 0;
+ int b;
+ TEMP_CONTEXT t, t2;
+ int type = 0;
+
+ vp8_setup_temp_context(&t, x->e_mbd.above_context[Y1CONTEXT], x->e_mbd.left_context[Y1CONTEXT], 4);
+
+ if (x->e_mbd.mbmi.mode == SPLITMV || x->e_mbd.mbmi.mode == B_PRED)
+ type = 3;
+
+ for (b = 0; b < 16; b++)
+ {
+ //vp8_optimize_bplus(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+ vp8_optimize_b(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd);
+ }
+
+ vp8_setup_temp_context(&t, x->e_mbd.above_context[UCONTEXT], x->e_mbd.left_context[UCONTEXT], 2);
+ vp8_setup_temp_context(&t2, x->e_mbd.above_context[VCONTEXT], x->e_mbd.left_context[VCONTEXT], 2);
+
+ for (b = 16; b < 20; b++)
+ {
+ //vp8_optimize_bplus(x, b, vp8_block2type[b], t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+ vp8_optimize_b(x, b, vp8_block2type[b], t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd);
+ }
+
+ for (b = 20; b < 24; b++)
+ {
+ //vp8_optimize_bplus(x, b, vp8_block2type[b], t2.a + vp8_block2above[b], t2.l + vp8_block2left[b]);
+ vp8_optimize_b(x, b, vp8_block2type[b], t2.a + vp8_block2above[b], t2.l + vp8_block2left[b], rtcd);
+ }
+}
+
+
+
+void vp8_super_slow_yquant_optimization(MACROBLOCK *x, int type, const VP8_ENCODER_RTCD *rtcd)
+{
+ BLOCK *b = &x->block[0];
+ BLOCKD *bd = &x->e_mbd.block[0];
+ short *dequant_ptr = &bd->dequant[0][0];
+ struct
+ {
+ int block;
+ int pos;
+ } nzpos[256];
+ short saved_qcoefs[256];
+ short saved_dqcoefs[256];
+ short *coef_ptr = x->coeff;
+ short *qcoef_ptr = x->e_mbd.qcoeff;
+ short *dqcoef_ptr = x->e_mbd.dqcoeff;
+
+ int baserate, baseerror, baserd;
+ int rate, error, thisrd;
+ int i, k;
+ int nzcoefcount = 0;
+ int nc, bestnc = 0;
+ int besteob;
+
+    // this code assumes a particular macroblock coeff buffer layout
+ for (i = 0; i < 16; i++)
+ {
+        // count potential coefficients to be optimized
+ for (k = !type; k < 16; k++)
+ {
+ int qcoef = abs(qcoef_ptr[i*16 + k]);
+ int coef = abs(coef_ptr[i*16 + k]);
+ int dq = dequant_ptr[k];
+
+ if (qcoef && (qcoef * dq > coef) && (qcoef * dq < coef + dq))
+ {
+ nzpos[nzcoefcount].block = i;
+ nzpos[nzcoefcount].pos = k;
+ nzcoefcount++;
+ }
+ }
+ }
+
+    // if nothing here, do nothing for this macroblock.
+ if (!nzcoefcount || nzcoefcount > 15)
+ {
+ return;
+ }
+
+    /******************************************************************************
+    Looking from each coefficient's perspective, each identified coefficient above
+    could have 2 values: rounded-down(x) and rounded-up(x). Therefore the total
+    number of different states is less than 2**nzcoefcount.
+    ******************************************************************************/
+    // save the quantized coefficients and dequantized coefficients
+ vpx_memcpy(saved_qcoefs, x->e_mbd.qcoeff, 256);
+ vpx_memcpy(saved_dqcoefs, x->e_mbd.dqcoeff, 256);
+
+ baserate = mbycost_coeffs(x);
+ baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(x, !type);
+ baserd = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100);
+
+ for (nc = 1; nc < (1 << nzcoefcount); nc++)
+ {
+ //reset coefficients
+ vpx_memcpy(x->e_mbd.qcoeff, saved_qcoefs, 256);
+ vpx_memcpy(x->e_mbd.dqcoeff, saved_dqcoefs, 256);
+
+ for (k = 0; k < nzcoefcount; k++)
+ {
+ int bk = nzpos[k].block;
+ int pos = nzpos[k].pos;
+ int mbkpos = bk * 16 + pos;
+
+ if ((nc & (1 << k)))
+ {
+ int cur_qcoef = x->e_mbd.qcoeff[mbkpos];
+
+ if (cur_qcoef < 0)
+ {
+ x->e_mbd.qcoeff[mbkpos]++;
+ x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos];
+ }
+ else
+ {
+ x->e_mbd.qcoeff[mbkpos]--;
+ x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos];
+ }
+ }
+ }
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *bd = &x->e_mbd.block[i];
+ {
+ int eob = -1;
+ int rc;
+ int l;
+
+ for (l = 0; l < 16; l++)
+ {
+ rc = vp8_default_zig_zag1d[l];
+
+ if (bd->qcoeff[rc])
+ eob = l;
+ }
+
+ bd->eob = eob + 1;
+ }
+ }
+
+ rate = mbycost_coeffs(x);
+        error = ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(x, !type);
+ thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100);
+
+ if (thisrd < baserd)
+ {
+ baserd = thisrd;
+ bestnc = nc;
+ besteob = bd->eob;
+ }
+ }
+
+ //reset coefficients
+ vpx_memcpy(x->e_mbd.qcoeff, saved_qcoefs, 256);
+ vpx_memcpy(x->e_mbd.dqcoeff, saved_dqcoefs, 256);
+
+ if (bestnc)
+ {
+ for (k = 0; k < nzcoefcount; k++)
+ {
+ int bk = nzpos[k].block;
+ int pos = nzpos[k].pos;
+ int mbkpos = bk * 16 + pos;
+
+            if ((bestnc & (1 << k)))
+ {
+ int cur_qcoef = x->e_mbd.qcoeff[mbkpos];
+
+ if (cur_qcoef < 0)
+ {
+ x->e_mbd.qcoeff[mbkpos]++;
+ x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos];
+ }
+ else
+ {
+ x->e_mbd.qcoeff[mbkpos]--;
+ x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos];
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *bd = &x->e_mbd.block[i];
+ {
+ int eob = -1;
+ int rc;
+ int l;
+
+ for (l = 0; l < 16; l++)
+ {
+ rc = vp8_default_zig_zag1d[l];
+
+ if (bd->qcoeff[rc])
+ eob = l;
+ }
+
+ bd->eob = eob + 1;
+ }
+ }
+
+ return;
+}
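A toy, self-contained illustration of the exhaustive search strategy used above, assuming a small hand-made coefficient array rather than the encoder's BLOCKD layout: bit k of the mask selects whether flagged coefficient k is pulled one quantizer step toward zero, so n flagged coefficients produce 2^n - 1 non-trivial candidate patterns to cost.

#include <stdio.h>

int main(void)
{
    short qcoeff[3] = { 5, -3, 2 };   /* hypothetical flagged coefficients */
    int   n = 3;
    int   mask;

    for (mask = 1; mask < (1 << n); mask++)
    {
        short trial[3];
        int   k;

        for (k = 0; k < n; k++)
        {
            trial[k] = qcoeff[k];

            if (mask & (1 << k))
                trial[k] += (trial[k] < 0) ? 1 : -1;   /* one step toward zero */
        }

        /* a real search would now re-cost rate and distortion for this pattern */
        printf("mask %d -> {%d, %d, %d}\n", mask, trial[0], trial[1], trial[2]);
    }

    return 0;
}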
+
+static void vp8_find_mb_skip_coef(MACROBLOCK *x)
+{
+ int i;
+
+ x->e_mbd.mbmi.mb_skip_coeff = 1;
+
+ if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
+ }
+
+ for (i = 16; i < 25; i++)
+ {
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+ }
+ }
+ else
+ {
+ for (i = 0; i < 24; i++)
+ {
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+ }
+ }
+}
+
+
+void vp8_optimize_mb_slow(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+ int cost = 0;
+ int b;
+ TEMP_CONTEXT t, t2;
+ int type = 0;
+
+
+ vp8_setup_temp_context(&t, x->e_mbd.above_context[Y1CONTEXT], x->e_mbd.left_context[Y1CONTEXT], 4);
+
+ if (x->e_mbd.mbmi.mode == SPLITMV || x->e_mbd.mbmi.mode == B_PRED)
+ type = 3;
+
+ vp8_super_slow_yquant_optimization(x, type, rtcd);
+ /*
+ for(b=0;b<16;b++)
+ {
+ vp8_optimize_b(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+ }
+ */
+
+ vp8_setup_temp_context(&t, x->e_mbd.above_context[UCONTEXT], x->e_mbd.left_context[UCONTEXT], 2);
+
+ for (b = 16; b < 20; b++)
+ {
+ vp8_optimize_b(x, b, vp8_block2type[b], t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd);
+ }
+
+ vp8_setup_temp_context(&t2, x->e_mbd.above_context[VCONTEXT], x->e_mbd.left_context[VCONTEXT], 2);
+
+ for (b = 20; b < 24; b++)
+ {
+ vp8_optimize_b(x, b, vp8_block2type[b], t2.a + vp8_block2above[b], t2.l + vp8_block2left[b], rtcd);
+ }
+}
+
+
+void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+ int cost = 0;
+ int b;
+ TEMP_CONTEXT t;
+ int type = 0;
+
+ if (!x->e_mbd.above_context[Y1CONTEXT])
+ return;
+
+ if (!x->e_mbd.left_context[Y1CONTEXT])
+ return;
+
+ vp8_setup_temp_context(&t, x->e_mbd.above_context[Y1CONTEXT], x->e_mbd.left_context[Y1CONTEXT], 4);
+
+ if (x->e_mbd.mbmi.mode == SPLITMV || x->e_mbd.mbmi.mode == B_PRED)
+ type = 3;
+
+ for (b = 0; b < 16; b++)
+ {
+ vp8_optimize_b(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd);
+ }
+
+}
+
+void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+ int cost = 0;
+ int b;
+ TEMP_CONTEXT t, t2;
+ int type = 0;
+
+ if (!x->e_mbd.above_context[UCONTEXT])
+ return;
+
+ if (!x->e_mbd.left_context[UCONTEXT])
+ return;
+
+ if (!x->e_mbd.above_context[VCONTEXT])
+ return;
+
+ if (!x->e_mbd.left_context[VCONTEXT])
+ return;
+
+
+ vp8_setup_temp_context(&t, x->e_mbd.above_context[UCONTEXT], x->e_mbd.left_context[UCONTEXT], 2);
+ vp8_setup_temp_context(&t2, x->e_mbd.above_context[VCONTEXT], x->e_mbd.left_context[VCONTEXT], 2);
+
+ for (b = 16; b < 20; b++)
+ {
+ vp8_optimize_b(x, b, vp8_block2type[b],
+ t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd);
+
+ }
+
+ for (b = 20; b < 24; b++)
+ {
+ vp8_optimize_b(x, b, vp8_block2type[b],
+ t2.a + vp8_block2above[b], t2.l + vp8_block2left[b], rtcd);
+ }
+
+}
+#endif
+
+void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+ vp8_build_inter_predictors_mb(&x->e_mbd);
+
+ vp8_subtract_mb(rtcd, x);
+
+ vp8_transform_mb(x);
+
+ vp8_quantize_mb(x);
+
+#if !(CONFIG_REALTIME_ONLY)
+#if 1
+
+ if (x->optimize && x->rddiv > 1)
+ {
+ vp8_optimize_mb(x, rtcd);
+ vp8_find_mb_skip_coef(x);
+ }
+
+#endif
+#endif
+
+ vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+ vp8_recon16x16mb(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+}
+
+
+/* this function is used by the first pass only */
+void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+ vp8_build_inter_predictors_mby(&x->e_mbd);
+
+ ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+
+ vp8_transform_mby(x);
+
+ vp8_quantize_mby(x);
+
+ vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+ vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+}
+
+
+void vp8_encode_inter16x16uv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+ vp8_build_inter_predictors_mbuv(&x->e_mbd);
+
+ ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+
+ vp8_transform_mbuv(x);
+
+ vp8_quantize_mbuv(x);
+
+ vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+ vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+}
+
+
+void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+ vp8_build_inter_predictors_mbuv(&x->e_mbd);
+ ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+
+ vp8_transform_mbuvrd(x);
+
+ vp8_quantize_mbuvrd(x);
+
+}
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
new file mode 100644
index 000000000..91ca8f552
--- /dev/null
+++ b/vp8/encoder/encodemb.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMB_H
+#define __INC_ENCODEMB_H
+
+#include "vpx_ports/config.h"
+#include "block.h"
+
+#define prototype_mberr(sym) \
+ int (sym)(MACROBLOCK *mb, int dc)
+
+#define prototype_berr(sym) \
+ int (sym)(short *coeff, short *dqcoeff)
+
+#define prototype_mbuverr(sym) \
+ int (sym)(MACROBLOCK *mb)
+
+#define prototype_subb(sym) \
+ void (sym)(BLOCK *be,BLOCKD *bd, int pitch)
+
+#define prototype_submby(sym) \
+ void (sym)(short *diff, unsigned char *src, unsigned char *pred, int stride)
+
+#define prototype_submbuv(sym) \
+ void (sym)(short *diff, unsigned char *usrc, unsigned char *vsrc,\
+ unsigned char *pred, int stride)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/encodemb_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/encodemb_arm.h"
+#endif
+
+#ifndef vp8_encodemb_berr
+#define vp8_encodemb_berr vp8_block_error_c
+#endif
+extern prototype_berr(vp8_encodemb_berr);
+
+#ifndef vp8_encodemb_mberr
+#define vp8_encodemb_mberr vp8_mbblock_error_c
+#endif
+extern prototype_mberr(vp8_encodemb_mberr);
+
+#ifndef vp8_encodemb_mbuverr
+#define vp8_encodemb_mbuverr vp8_mbuverror_c
+#endif
+extern prototype_mbuverr(vp8_encodemb_mbuverr);
+
+#ifndef vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_c
+#endif
+extern prototype_subb(vp8_encodemb_subb);
+
+#ifndef vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_c
+#endif
+extern prototype_submby(vp8_encodemb_submby);
+
+#ifndef vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_c
+#endif
+extern prototype_submbuv(vp8_encodemb_submbuv);
+
+
+typedef struct
+{
+ prototype_berr(*berr);
+ prototype_mberr(*mberr);
+ prototype_mbuverr(*mbuverr);
+ prototype_subb(*subb);
+ prototype_submby(*submby);
+ prototype_submbuv(*submbuv);
+} vp8_encodemb_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define ENCODEMB_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define ENCODEMB_INVOKE(ctx,fn) vp8_encodemb_##fn
+#endif
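A minimal sketch of the dispatch pattern this header sets up, using hypothetical toy names rather than the real prototypes: with CONFIG_RUNTIME_CPU_DETECT the INVOKE macro calls through a function-pointer table filled in at init time, while a static build expands the same macro to a direct call to the default C routine.

#include <stdio.h>

/* Hypothetical stand-in for the real vtable; only the shape matters here. */
typedef struct
{
    int (*berr)(short *coeff, short *dqcoeff);
} toy_encodemb_vtable_t;

static int toy_block_error_c(short *coeff, short *dqcoeff)
{
    int i, err = 0;

    for (i = 0; i < 16; i++)
    {
        int d = coeff[i] - dqcoeff[i];
        err += d * d;
    }

    return err;
}

/* Runtime-detect flavour: indirect call through the table.  A static build
   would instead use something like: #define TOY_INVOKE(ctx, fn) toy_##fn */
#define TOY_INVOKE(ctx, fn) (ctx)->fn

int main(void)
{
    toy_encodemb_vtable_t rtcd = { toy_block_error_c };
    short coeff[16]   = { 4, -2, 1 };   /* remaining entries are zero */
    short dqcoeff[16] = { 3, -2, 0 };

    printf("block error = %d\n", TOY_INVOKE(&rtcd, berr)(coeff, dqcoeff));
    return 0;
}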
+
+
+
+#include "onyx_int.h"
+struct VP8_ENCODER_RTCD;
+void vp8_encode_inter16x16(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+
+extern void vp8_stuff_inter16x16(MACROBLOCK *x);
+
+void vp8_build_dcblock(MACROBLOCK *b);
+void vp8_transform_mb(MACROBLOCK *mb);
+void vp8_transform_mbuv(MACROBLOCK *x);
+void vp8_transform_mbuvrd(MACROBLOCK *x);
+void vp8_transform_intra_mby(MACROBLOCK *x);
+void vp8_transform_intra_mbyrd(MACROBLOCK *x);
+void Encode16x16Y(MACROBLOCK *x);
+void Encode16x16UV(MACROBLOCK *x);
+void vp8_encode_inter16x16uv(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp8_encode_inter16x16uvrd(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp8_optimize_mby(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
+void vp8_optimize_mbuv(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
+void vp8_encode_inter16x16y(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+#endif
diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
new file mode 100644
index 000000000..f287edc18
--- /dev/null
+++ b/vp8/encoder/encodemv.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "common.h"
+#include "encodemv.h"
+#include "entropymode.h"
+#include "systemdependent.h"
+
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+extern unsigned int active_section;
+#endif
+
+static void encode_mvcomponent(
+ vp8_writer *const w,
+ const int v,
+ const struct mv_context *mvc
+)
+{
+ const vp8_prob *p = mvc->prob;
+ const int x = v < 0 ? -v : v;
+
+ if (x < mvnum_short) // Small
+ {
+ vp8_write(w, 0, p [mvpis_short]);
+ vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3);
+
+ if (!x)
+ return; // no sign bit
+ }
+ else // Large
+ {
+ int i = 0;
+
+ vp8_write(w, 1, p [mvpis_short]);
+
+ do
+ vp8_write(w, (x >> i) & 1, p [MVPbits + i]);
+
+ while (++i < 3);
+
+ i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
+
+ do
+ vp8_write(w, (x >> i) & 1, p [MVPbits + i]);
+
+ while (--i > 3);
+
+ if (x & 0xFFF0)
+ vp8_write(w, (x >> 3) & 1, p [MVPbits + 3]);
+ }
+
+ vp8_write(w, v < 0, p [MVPsign]);
+}
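A small illustration of the long-vector bit layout in encode_mvcomponent() above, assuming VP8's usual constants mvnum_short == 8 and mvlong_width == 10: bits 0-2 and bits 9 down to 4 are always written, while bit 3 is written only when some higher bit is set (x & 0xFFF0), because a long magnitude below 16 must have bit 3 equal to 1 and the decoder can infer it.

#include <stdio.h>

int main(void)
{
    int samples[3] = { 9, 21, 700 };   /* hypothetical long MV magnitudes */
    int i;

    for (i = 0; i < 3; i++)
    {
        int x = samples[i];
        int bit3_sent = (x & 0xFFF0) != 0;

        printf("magnitude %3d: bits 0-2 and 9-4 sent, bit 3 %s\n",
               x, bit3_sent ? "sent explicitly" : "implicit (must be 1)");
    }

    return 0;
}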
+#if 0
+static int max_mv_r = 0;
+static int max_mv_c = 0;
+#endif
+void vp8_encode_motion_vector(vp8_writer *w, const MV *mv, const MV_CONTEXT *mvc)
+{
+
+#if 0
+ {
+ if (abs(mv->row >> 1) > max_mv_r)
+ {
+ FILE *f = fopen("maxmv.stt", "a");
+ max_mv_r = abs(mv->row >> 1);
+ fprintf(f, "New Mv Row Max %6d\n", (mv->row >> 1));
+
+ if ((abs(mv->row) / 2) != max_mv_r)
+ fprintf(f, "MV Row conversion error %6d\n", abs(mv->row) / 2);
+
+ fclose(f);
+ }
+
+ if (abs(mv->col >> 1) > max_mv_c)
+ {
+ FILE *f = fopen("maxmv.stt", "a");
+ fprintf(f, "New Mv Col Max %6d\n", (mv->col >> 1));
+ max_mv_c = abs(mv->col >> 1);
+ fclose(f);
+ }
+ }
+#endif
+
+ encode_mvcomponent(w, mv->row >> 1, &mvc[0]);
+ encode_mvcomponent(w, mv->col >> 1, &mvc[1]);
+}
+
+
+static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)
+{
+ const vp8_prob *p = mvc->prob;
+ const int x = v; //v<0? -v:v;
+ unsigned int cost;
+
+ if (x < mvnum_short)
+ {
+ cost = vp8_cost_zero(p [mvpis_short])
+ + vp8_treed_cost(vp8_small_mvtree, p + MVPshort, x, 3);
+
+ if (!x)
+ return cost;
+ }
+ else
+ {
+ int i = 0;
+ cost = vp8_cost_one(p [mvpis_short]);
+
+ do
+ cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1);
+
+ while (++i < 3);
+
+ i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
+
+ do
+ cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1);
+
+ while (--i > 3);
+
+ if (x & 240)
+ cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
+ }
+
+ return cost; // + vp8_cost_bit( p [MVPsign], v < 0);
+}
+//#define M_LOG2_E 0.693147180559945309417
+//#define log2f(x) (log (x) / (float) M_LOG2_E)
+
+void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])
+{
+ int i = 1; //-mv_max;
+ unsigned int cost0 = 0;
+ unsigned int cost1 = 0;
+
+ vp8_clear_system_state();
+#if 0
+ mvsadcost [0] [0] = 300;
+ mvsadcost [1] [0] = 300;
+
+ do
+ {
+ double z = 256 * (2 * (log2f(2 * i) + .6));
+ mvsadcost [0][i] = (int) z;
+ mvsadcost [1][i] = (int) z;
+ mvsadcost [0][-i] = (int) z;
+ mvsadcost [1][-i] = (int) z;
+ }
+ while (++i <= mv_max);
+
+#endif
+
+ i = 1;
+
+ if (mvc_flag[0])
+ {
+ mvcost [0] [0] = cost_mvcomponent(0, &mvc[0]);
+
+ do
+ {
+ //mvcost [0] [i] = cost_mvcomponent( i, &mvc[0]);
+ cost0 = cost_mvcomponent(i, &mvc[0]);
+
+ mvcost [0] [i] = cost0 + vp8_cost_zero(mvc[0].prob[MVPsign]);
+ mvcost [0] [-i] = cost0 + vp8_cost_one(mvc[0].prob[MVPsign]);
+ }
+ while (++i <= mv_max);
+ }
+
+ i = 1;
+
+ if (mvc_flag[1])
+ {
+ mvcost [1] [0] = cost_mvcomponent(0, &mvc[1]);
+
+ do
+ {
+ //mvcost [1] [i] = cost_mvcomponent( i, mvc[1]);
+ cost1 = cost_mvcomponent(i, &mvc[1]);
+
+ mvcost [1] [i] = cost1 + vp8_cost_zero(mvc[1].prob[MVPsign]);
+ mvcost [1] [-i] = cost1 + vp8_cost_one(mvc[1].prob[MVPsign]);
+ }
+ while (++i <= mv_max);
+ }
+
+ /*
+ i=-mv_max;
+ do
+ {
+ mvcost [0] [i] = cost_mvcomponent( i, mvc[0]);
+ mvcost [1] [i] = cost_mvcomponent( i, mvc[1]);
+ }
+ while( ++i <= mv_max);
+ */
+}
+
+
+// Motion vector probability table update depends on benefit.
+// Small correction allows for the fact that an update to an MV probability
+// may have benefit in subsequent frames as well as the current one.
+
+#define MV_PROB_UPDATE_CORRECTION -1
+
+
+__inline static void calc_prob(vp8_prob *p, const unsigned int ct[2])
+{
+ const unsigned int tot = ct[0] + ct[1];
+
+ if (tot)
+ {
+ const vp8_prob x = ((ct[0] * 255) / tot) & -2;
+ *p = x ? x : 1;
+ }
+}
+
+static void update(
+ vp8_writer *const w,
+ const unsigned int ct[2],
+ vp8_prob *const cur_p,
+ const vp8_prob new_p,
+ const vp8_prob update_p,
+ int *updated
+)
+{
+ const int cur_b = vp8_cost_branch(ct, *cur_p);
+ const int new_b = vp8_cost_branch(ct, new_p);
+ const int cost = 7 + MV_PROB_UPDATE_CORRECTION + ((vp8_cost_one(update_p) - vp8_cost_zero(update_p) + 128) >> 8);
+
+ if (cur_b - new_b > cost)
+ {
+ *cur_p = new_p;
+ vp8_write(w, 1, update_p);
+ vp8_write_literal(w, new_p >> 1, 7);
+ *updated = 1;
+
+ }
+ else
+ vp8_write(w, 0, update_p);
+}
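A rough, self-contained sketch of the benefit test performed by update(), with a log2-based stand-in for vp8_cost_branch() (the real function uses precomputed 1/256-bit cost tables rather than floating point) and illustrative branch counts: the new probability is only signalled when the bits saved on the branch outweigh the roughly eight bits it costs to code the update itself.

#include <math.h>
#include <stdio.h>

typedef unsigned char vp8_prob;

/* Approximate cost, in whole bits, of coding ct[0] zeros and ct[1] ones
   against a probability of p out of 256. */
static int toy_cost_branch(const unsigned int ct[2], vp8_prob p)
{
    double bits = ct[0] * -log2(p / 256.0) + ct[1] * -log2((256 - p) / 256.0);
    return (int)bits;
}

int main(void)
{
    unsigned int ct[2] = { 900, 100 };   /* hypothetical branch counts */
    vp8_prob cur_p = 128, new_p = 230;
    int overhead = 7 + 1;                /* update flag plus 7-bit literal, roughly */
    int cur_b = toy_cost_branch(ct, cur_p);
    int new_b = toy_cost_branch(ct, new_p);

    if (cur_b - new_b > overhead)
        printf("send update, saving about %d bits\n", cur_b - new_b - overhead);
    else
        printf("keep the old probability\n");

    return 0;
}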
+
+static void write_component_probs(
+ vp8_writer *const w,
+ struct mv_context *cur_mvc,
+ const struct mv_context *default_mvc_,
+ const struct mv_context *update_mvc,
+ const unsigned int events [MVvals],
+ unsigned int rc,
+ int *updated
+)
+{
+ vp8_prob *Pcur = cur_mvc->prob;
+ const vp8_prob *default_mvc = default_mvc_->prob;
+ const vp8_prob *Pupdate = update_mvc->prob;
+ unsigned int is_short_ct[2], sign_ct[2];
+
+ unsigned int bit_ct [mvlong_width] [2];
+
+ unsigned int short_ct [mvnum_short];
+ unsigned int short_bct [mvnum_short-1] [2];
+
+ vp8_prob Pnew [MVPcount];
+
+ (void) rc;
+ vp8_copy_array(Pnew, default_mvc, MVPcount);
+
+ vp8_zero(is_short_ct)
+ vp8_zero(sign_ct)
+ vp8_zero(bit_ct)
+ vp8_zero(short_ct)
+ vp8_zero(short_bct)
+
+
+ //j=0
+ {
+ int j = 0;
+
+ const int c = events [mv_max];
+
+ is_short_ct [0] += c; // Short vector
+ short_ct [0] += c; // Magnitude distribution
+ }
+
+ //j: 1 ~ mv_max (1023)
+ {
+ int j = 1;
+
+ do
+ {
+ const int c1 = events [mv_max + j]; //positive
+ const int c2 = events [mv_max - j]; //negative
+ const int c = c1 + c2;
+ int a = j;
+
+ sign_ct [0] += c1;
+ sign_ct [1] += c2;
+
+ if (a < mvnum_short)
+ {
+ is_short_ct [0] += c; // Short vector
+ short_ct [a] += c; // Magnitude distribution
+ }
+ else
+ {
+ int k = mvlong_width - 1;
+ is_short_ct [1] += c; // Long vector
+
+ /* bit 3 not always encoded. */
+ do
+ bit_ct [k] [(a >> k) & 1] += c;
+
+ while (--k >= 0);
+ }
+ }
+ while (++j <= mv_max);
+ }
+
+ /*
+ {
+ int j = -mv_max;
+ do
+ {
+
+ const int c = events [mv_max + j];
+ int a = j;
+
+ if( j < 0)
+ {
+ sign_ct [1] += c;
+ a = -j;
+ }
+ else if( j)
+ sign_ct [0] += c;
+
+ if( a < mvnum_short)
+ {
+ is_short_ct [0] += c; // Short vector
+ short_ct [a] += c; // Magnitude distribution
+ }
+ else
+ {
+ int k = mvlong_width - 1;
+ is_short_ct [1] += c; // Long vector
+
+ // bit 3 not always encoded.
+
+ do
+ bit_ct [k] [(a >> k) & 1] += c;
+ while( --k >= 0);
+ }
+ } while( ++j <= mv_max);
+ }
+ */
+
+ calc_prob(Pnew + mvpis_short, is_short_ct);
+
+ calc_prob(Pnew + MVPsign, sign_ct);
+
+ {
+ vp8_prob p [mvnum_short - 1]; /* actually only need branch ct */
+ int j = 0;
+
+ vp8_tree_probs_from_distribution(
+ 8, vp8_small_mvencodings, vp8_small_mvtree,
+ p, short_bct, short_ct,
+ 256, 1
+ );
+
+ do
+ calc_prob(Pnew + MVPshort + j, short_bct[j]);
+
+ while (++j < mvnum_short - 1);
+ }
+
+ {
+ int j = 0;
+
+ do
+ calc_prob(Pnew + MVPbits + j, bit_ct[j]);
+
+ while (++j < mvlong_width);
+ }
+
+ update(w, is_short_ct, Pcur + mvpis_short, Pnew[mvpis_short], *Pupdate++, updated);
+
+ update(w, sign_ct, Pcur + MVPsign, Pnew[MVPsign], *Pupdate++, updated);
+
+ {
+ const vp8_prob *const new_p = Pnew + MVPshort;
+ vp8_prob *const cur_p = Pcur + MVPshort;
+
+ int j = 0;
+
+ do
+
+ update(w, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated);
+
+ while (++j < mvnum_short - 1);
+ }
+
+ {
+ const vp8_prob *const new_p = Pnew + MVPbits;
+ vp8_prob *const cur_p = Pcur + MVPbits;
+
+ int j = 0;
+
+ do
+
+ update(w, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated);
+
+ while (++j < mvlong_width);
+ }
+}
+
+void vp8_write_mvprobs(VP8_COMP *cpi)
+{
+ vp8_writer *const w = & cpi->bc;
+ MV_CONTEXT *mvc = cpi->common.fc.mvc;
+ int flags[2] = {0, 0};
+#ifdef ENTROPY_STATS
+ active_section = 4;
+#endif
+ write_component_probs(
+ w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0], cpi->MVcount[0], 0, &flags[0]
+ );
+ write_component_probs(
+ w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1], cpi->MVcount[1], 1, &flags[1]
+ );
+
+ if (flags[0] || flags[1])
+ vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);
+
+#ifdef ENTROPY_STATS
+ active_section = 5;
+#endif
+}
diff --git a/vp8/encoder/encodemv.h b/vp8/encoder/encodemv.h
new file mode 100644
index 000000000..1c1f450a0
--- /dev/null
+++ b/vp8/encoder/encodemv.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMV_H
+#define __INC_ENCODEMV_H
+
+#include "onyx_int.h"
+
+void vp8_write_mvprobs(VP8_COMP *);
+void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *);
+void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);
+
+#endif
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
new file mode 100644
index 000000000..a0b50d2a1
--- /dev/null
+++ b/vp8/encoder/ethreading.c
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "onyx_int.h"
+#include "threading.h"
+#include "common.h"
+#include "extend.h"
+
+
+extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);
+extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
+extern void vp8_build_block_offsets(MACROBLOCK *x);
+extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+
+static
+THREAD_FUNCTION thread_encoding_proc(void *p_data)
+{
+#if CONFIG_MULTITHREAD
+ int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
+ VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
+ MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
+ ENTROPY_CONTEXT mb_row_left_context[4][4];
+
+ //printf("Started thread %d\n", ithread);
+
+ while (1)
+ {
+ if (cpi->b_multi_threaded == 0)
+ break;
+
+ //if(WaitForSingleObject(cpi->h_event_mbrencoding[ithread], INFINITE) == WAIT_OBJECT_0)
+ if (sem_wait(&cpi->h_event_mbrencoding[ithread]) == 0)
+ {
+ if (cpi->b_multi_threaded == FALSE) // we're shutting down
+ break;
+ else
+ {
+ VP8_COMMON *cm = &cpi->common;
+ int mb_row = mbri->mb_row;
+ MACROBLOCK *x = &mbri->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ TOKENEXTRA **tp = &mbri->tp;
+ int *segment_counts = mbri->segment_counts;
+ int *totalrate = &mbri->totalrate;
+
+ {
+ int i;
+ int recon_yoffset, recon_uvoffset;
+ int mb_col;
+ int recon_y_stride = cm->last_frame.y_stride;
+ int recon_uv_stride = cm->last_frame.uv_stride;
+ volatile int *last_row_current_mb_col;
+
+ if (ithread > 0)
+ last_row_current_mb_col = &cpi->mb_row_ei[ithread-1].current_mb_col;
+ else
+ last_row_current_mb_col = &cpi->current_mb_col_main;
+
+ // reset above block coeffs
+ xd->above_context[Y1CONTEXT] = cm->above_context[Y1CONTEXT];
+ xd->above_context[UCONTEXT ] = cm->above_context[UCONTEXT ];
+ xd->above_context[VCONTEXT ] = cm->above_context[VCONTEXT ];
+ xd->above_context[Y2CONTEXT] = cm->above_context[Y2CONTEXT];
+ xd->left_context = mb_row_left_context;
+
+ vp8_zero(mb_row_left_context);
+
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+
+ cpi->tplist[mb_row].start = *tp;
+
+ //printf("Thread mb_row = %d\n", mb_row);
+
+ // for each macroblock col in image
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int seg_map_index = (mb_row * cm->mb_cols);
+
+ while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != cm->mb_cols - 1)
+ {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+
+ // Distance of Mb to the various image edges.
+                        // These are specified to 1/8th pel as they are always compared to values that are in 1/8th pel units
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+ // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
+ x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+ xd->dst.y_buffer = cm->new_frame.y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->new_frame.u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->new_frame.v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+
+                        // Is segmentation enabled?
+                        // MB level adjustment to quantizer
+ if (xd->segmentation_enabled)
+ {
+ // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
+ if (cpi->segmentation_map[seg_map_index+mb_col] <= 3)
+ xd->mbmi.segment_id = cpi->segmentation_map[seg_map_index+mb_col];
+ else
+ xd->mbmi.segment_id = 0;
+
+ vp8cx_mb_init_quantizer(cpi, x);
+ }
+ else
+ xd->mbmi.segment_id = 0; // Set to Segment 0 by default
+
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp);
+#ifdef MODE_STATS
+ y_modes[xd->mbmi.mode] ++;
+#endif
+ }
+ else
+ {
+ *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset);
+
+#ifdef MODE_STATS
+ inter_y_modes[xd->mbmi.mode] ++;
+
+ if (xd->mbmi.mode == SPLITMV)
+ {
+ int b;
+
+ for (b = 0; b < xd->mbmi.partition_count; b++)
+ {
+ inter_b_modes[xd->mbmi.partition_bmi[b].mode] ++;
+ }
+ }
+
+#endif
+
+                            // Count of last ref frame 0,0 usage
+ if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame == LAST_FRAME))
+ cpi->inter_zz_count ++;
+
+ }
+
+ cpi->tplist[mb_row].stop = *tp;
+
+                        xd->gf_active_ptr++; // Increment pointer into gf usage flags structure for next mb
+
+ // store macroblock mode info into context array
+ vpx_memcpy(&xd->mode_info_context->mbmi, &xd->mbmi, sizeof(xd->mbmi));
+
+ for (i = 0; i < 16; i++)
+ vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi));
+
+ // adjust to the next column of macroblocks
+ x->src.y_buffer += 16;
+ x->src.u_buffer += 8;
+ x->src.v_buffer += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+                        // Keep track of segment usage
+ segment_counts[xd->mbmi.segment_id] ++;
+
+ // skip to next mb
+ xd->mode_info_context++;
+
+ xd->above_context[Y1CONTEXT] += 4;
+ xd->above_context[UCONTEXT ] += 2;
+ xd->above_context[VCONTEXT ] += 2;
+ xd->above_context[Y2CONTEXT] ++;
+
+ cpi->mb_row_ei[ithread].current_mb_col = mb_col;
+
+ }
+
+ //extend the recon for intra prediction
+ vp8_extend_mb_row(
+ &cm->new_frame,
+ xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8,
+ xd->dst.v_buffer + 8);
+
+ // this is to account for the border
+ xd->mode_info_context++;
+
+ x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+ xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+
+ if (ithread == (cpi->encoding_thread_count - 1) || mb_row == cm->mb_rows - 1)
+ {
+ //SetEvent(cpi->h_event_main);
+ sem_post(&cpi->h_event_main);
+ }
+
+ }
+
+ }
+ }
+ }
+
+#else
+ (void) p_data;
+#endif
+
+ //printf("exit thread %d\n", ithread);
+ return 0;
+}
+
+static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
+{
+
+ MACROBLOCK *x = mbsrc;
+ MACROBLOCK *z = mbdst;
+ int i;
+
+ z->ss = x->ss;
+ z->ss_count = x->ss_count;
+ z->searches_per_step = x->searches_per_step;
+ z->errorperbit = x->errorperbit;
+
+ z->sadperbit16 = x->sadperbit16;
+ z->sadperbit4 = x->sadperbit4;
+ z->errthresh = x->errthresh;
+ z->rddiv = x->rddiv;
+ z->rdmult = x->rdmult;
+
+ /*
+ z->mv_col_min = x->mv_col_min;
+ z->mv_col_max = x->mv_col_max;
+ z->mv_row_min = x->mv_row_min;
+ z->mv_row_max = x->mv_row_max;
+ z->vector_range = x->vector_range ;
+ */
+
+ z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4;
+ z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
+ z->short_fdct4x4rd = x->short_fdct4x4rd;
+    z->short_fdct8x4rd = x->short_fdct8x4rd;
+ z->vp8_short_fdct4x4_ptr = x->vp8_short_fdct4x4_ptr;
+ z->short_walsh4x4 = x->short_walsh4x4;
+ z->quantize_b = x->quantize_b;
+ z->quantize_brd = x->quantize_brd;
+
+ /*
+ z->mvc = x->mvc;
+ z->src.y_buffer = x->src.y_buffer;
+ z->src.u_buffer = x->src.u_buffer;
+ z->src.v_buffer = x->src.v_buffer;
+ */
+
+
+ vpx_memcpy(z->mvcosts, x->mvcosts, sizeof(x->mvcosts));
+ z->mvcost[0] = &z->mvcosts[0][mv_max+1];
+ z->mvcost[1] = &z->mvcosts[1][mv_max+1];
+ z->mvsadcost[0] = &z->mvsadcosts[0][mv_max+1];
+ z->mvsadcost[1] = &z->mvsadcosts[1][mv_max+1];
+
+
+ vpx_memcpy(z->token_costs, x->token_costs, sizeof(x->token_costs));
+ vpx_memcpy(z->inter_bmode_costs, x->inter_bmode_costs, sizeof(x->inter_bmode_costs));
+ //memcpy(z->mvcosts, x->mvcosts, sizeof(x->mvcosts));
+ //memcpy(z->mvcost, x->mvcost, sizeof(x->mvcost));
+ vpx_memcpy(z->mbmode_cost, x->mbmode_cost, sizeof(x->mbmode_cost));
+ vpx_memcpy(z->intra_uv_mode_cost, x->intra_uv_mode_cost, sizeof(x->intra_uv_mode_cost));
+ vpx_memcpy(z->bmode_costs, x->bmode_costs, sizeof(x->bmode_costs));
+
+ for (i = 0; i < 25; i++)
+ {
+ z->block[i].quant = x->block[i].quant;
+ z->block[i].zbin = x->block[i].zbin;
+ z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost;
+ z->block[i].round = x->block[i].round;
+ /*
+ z->block[i].src = x->block[i].src;
+ */
+ z->block[i].src_stride = x->block[i].src_stride;
+ z->block[i].force_empty = x->block[i].force_empty;
+
+ }
+
+ {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MACROBLOCKD *zd = &z->e_mbd;
+
+ /*
+ zd->mode_info_context = xd->mode_info_context;
+ zd->mode_info = xd->mode_info;
+
+ zd->mode_info_stride = xd->mode_info_stride;
+ zd->frame_type = xd->frame_type;
+ zd->up_available = xd->up_available ;
+ zd->left_available = xd->left_available;
+ zd->left_context = xd->left_context;
+ zd->last_frame_dc = xd->last_frame_dc;
+ zd->last_frame_dccons = xd->last_frame_dccons;
+ zd->gold_frame_dc = xd->gold_frame_dc;
+ zd->gold_frame_dccons = xd->gold_frame_dccons;
+ zd->mb_to_left_edge = xd->mb_to_left_edge;
+ zd->mb_to_right_edge = xd->mb_to_right_edge;
+ zd->mb_to_top_edge = xd->mb_to_top_edge ;
+ zd->mb_to_bottom_edge = xd->mb_to_bottom_edge;
+ zd->gf_active_ptr = xd->gf_active_ptr;
+ zd->frames_since_golden = xd->frames_since_golden;
+ zd->frames_till_alt_ref_frame = xd->frames_till_alt_ref_frame;
+ */
+ zd->subpixel_predict = xd->subpixel_predict;
+ zd->subpixel_predict8x4 = xd->subpixel_predict8x4;
+ zd->subpixel_predict8x8 = xd->subpixel_predict8x8;
+ zd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+ zd->segmentation_enabled = xd->segmentation_enabled;
+ zd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+ vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+
+ /*
+ memcpy(zd->above_context, xd->above_context, sizeof(xd->above_context));
+ memcpy(zd->mb_segment_tree_probs, xd->mb_segment_tree_probs, sizeof(xd->mb_segment_tree_probs));
+ memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+ */
+ for (i = 0; i < 25; i++)
+ {
+ zd->block[i].dequant = xd->block[i].dequant;
+ }
+ }
+}
+
+
+void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ MB_ROW_COMP *mbr_ei,
+ int mb_row,
+ int count
+ )
+{
+
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+ int i;
+ (void) mb_row;
+
+ for (i = 0; i < count; i++)
+ {
+ MACROBLOCK *mb = & mbr_ei[i].mb;
+ MACROBLOCKD *mbd = &mb->e_mbd;
+
+ mbd->subpixel_predict = xd->subpixel_predict;
+ mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
+ mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
+ mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+#if CONFIG_RUNTIME_CPU_DETECT
+ mbd->rtcd = xd->rtcd;
+#endif
+ mbd->gf_active_ptr = xd->gf_active_ptr;
+
+ mb->vector_range = 32;
+
+ vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
+ mbr_ei[i].totalrate = 0;
+
+ mbd->mode_info = cm->mi - 1;
+ mbd->mode_info_context = cm->mi + x->e_mbd.mode_info_stride * (i + 1);
+ mbd->mode_info_stride = cm->mode_info_stride;
+
+ mbd->frame_type = cm->frame_type;
+
+ mbd->frames_since_golden = cm->frames_since_golden;
+ mbd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
+
+ mb->src = * cpi->Source;
+ mbd->pre = cm->last_frame;
+ mbd->dst = cm->new_frame;
+
+ mb->src.y_buffer += 16 * x->src.y_stride * (i + 1);
+ mb->src.u_buffer += 8 * x->src.uv_stride * (i + 1);
+ mb->src.v_buffer += 8 * x->src.uv_stride * (i + 1);
+
+
+ vp8_build_block_offsets(mb);
+
+ vp8_setup_block_dptrs(mbd);
+
+ vp8_setup_block_ptrs(mb);
+
+ mb->rddiv = cpi->RDDIV;
+ mb->rdmult = cpi->RDMULT;
+
+ mbd->mbmi.mode = DC_PRED;
+ mbd->mbmi.uv_mode = DC_PRED;
+
+ mbd->left_context = cm->left_context;
+ mb->mvc = cm->fc.mvc;
+
+ setup_mbby_copy(&mbr_ei[i].mb, x);
+
+ }
+}
+
+
+void vp8cx_create_encoder_threads(VP8_COMP *cpi)
+{
+ cpi->b_multi_threaded = 0;
+
+ cpi->processor_core_count = 32; //vp8_get_proc_core_count();
+
+ CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
+
+#if CONFIG_MULTITHREAD
+
+ if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
+ {
+ int ithread;
+
+ if (cpi->oxcf.multi_threaded > cpi->processor_core_count)
+ cpi->encoding_thread_count = cpi->processor_core_count - 1;
+ else
+ cpi->encoding_thread_count = cpi->oxcf.multi_threaded - 1;
+
+
+ CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * cpi->encoding_thread_count));
+ CHECK_MEM_ERROR(cpi->h_event_mbrencoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
+ CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count));
+ vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count);
+ CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * cpi->encoding_thread_count));
+ //cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL);
+ sem_init(&cpi->h_event_main, 0, 0);
+
+ cpi->b_multi_threaded = 1;
+
+ //printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n", (cpi->encoding_thread_count +1));
+
+ for (ithread = 0; ithread < cpi->encoding_thread_count; ithread++)
+ {
+ //cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL);
+ sem_init(&cpi->h_event_mbrencoding[ithread], 0, 0);
+ cpi->en_thread_data[ithread].ithread = ithread;
+ cpi->en_thread_data[ithread].ptr1 = (void *)cpi;
+ cpi->en_thread_data[ithread].ptr2 = (void *)&cpi->mb_row_ei[ithread];
+
+ //printf(" call begin thread %d \n", ithread);
+
+ //cpi->h_encoding_thread[ithread] = (HANDLE)_beginthreadex(
+ // NULL, // security
+ // 0, // stksize
+ // thread_encoding_proc,
+ // (&cpi->en_thread_data[ithread]), // Thread data
+ // 0,
+ // NULL);
+
+ pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, (&cpi->en_thread_data[ithread]));
+
+ }
+
+ }
+
+#endif
+}
+
+void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
+{
+#if CONFIG_MULTITHREAD
+
+ if (cpi->b_multi_threaded)
+ {
+ //shutdown other threads
+ cpi->b_multi_threaded = 0;
+ {
+ int i;
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ //SetEvent(cpi->h_event_mbrencoding[i]);
+ sem_post(&cpi->h_event_mbrencoding[i]);
+ pthread_join(cpi->h_encoding_thread[i], 0);
+ }
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ sem_destroy(&cpi->h_event_mbrencoding[i]);
+ }
+ //free thread related resources
+ vpx_free(cpi->h_event_mbrencoding);
+ vpx_free(cpi->h_encoding_thread);
+ vpx_free(cpi->mb_row_ei);
+ vpx_free(cpi->en_thread_data);
+ }
+
+#endif
+ vpx_free(cpi->tplist);
+}
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
new file mode 100644
index 000000000..c519080b2
--- /dev/null
+++ b/vp8/encoder/firstpass.c
@@ -0,0 +1,2512 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "math.h"
+#include "limits.h"
+#include "block.h"
+#include "onyx_int.h"
+#include "variance.h"
+#include "encodeintra.h"
+#include "setupintrarecon.h"
+#include "mcomp.h"
+#include "vpx_scale/vpxscale.h"
+#include "encodemb.h"
+#include "extend.h"
+#include "systemdependent.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "swapyv12buffer.h"
+#include <stdio.h>
+#include "rdopt.h"
+#include "quant_common.h"
+#include "encodemv.h"
+
+//#define OUTPUT_FPF 1
+//#define FIRSTPASS_MM 1
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void vp8_build_block_offsets(MACROBLOCK *x);
+extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi);
+extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv);
+extern void vp8_alloc_compressor_data(VP8_COMP *cpi);
+
+//#define GFQ_ADJUSTMENT (40 + ((15*Q)/10))
+//#define GFQ_ADJUSTMENT (80 + ((15*Q)/10))
+#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
+extern int vp8_kf_boost_qadjustment[QINDEX_RANGE];
+
+extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE];
+
+#define IIFACTOR 1.4
+#define IIKFACTOR1 1.40
+#define IIKFACTOR2 1.5
+#define RMAX 14.0
+#define GF_RMAX 48.0 // 128.0
+
+#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+
+#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
+#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
+
+static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
+static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};
+
+
+void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
+int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps);
+
+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
+{
+
+ int i;
+ int intra_pred_var = 0;
+ (void) cpi;
+
+ if (use_dc_pred)
+ {
+ x->e_mbd.mbmi.mode = DC_PRED;
+ x->e_mbd.mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+
+ vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+ }
+ else
+ {
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &x->e_mbd.block[i];
+ BLOCK *be = &x->block[i];
+
+ vp8_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, be, b, B_DC_PRED);
+ }
+ }
+
+ intra_pred_var = VARIANCE_INVOKE(&cpi->rtcd.variance, getmbss)(x->src_diff);
+
+ return intra_pred_var;
+}
+
+// Resets the first pass file to the given position using a relative seek from the current position
+static void reset_fpf_position(VP8_COMP *cpi, FIRSTPASS_STATS *Position)
+{
+ cpi->stats_in = Position;
+}
+
+static int lookup_next_frame_stats(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+ /*FIRSTPASS_STATS * start_pos;
+ int ret_val;
+
+ start_pos = cpi->stats_in;
+ ret_val = vp8_input_stats(cpi, next_frame);
+ reset_fpf_position(cpi, start_pos);
+
+ return ret_val;*/
+
+ if (cpi->stats_in >= cpi->stats_in_end)
+ return EOF;
+
+ *next_frame = *cpi->stats_in;
+ return 1;
+}
+
+// Calculate a modified Error used in distributing bits between easier and harder frames
+static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ double av_err = cpi->total_stats.ssim_weighted_pred_err;
+ double this_err = this_frame->ssim_weighted_pred_err;
+ double modified_err;
+
+ //double relative_next_iiratio;
+ //double next_iiratio;
+ //double sum_iiratio;
+ //int i;
+
+ //FIRSTPASS_STATS next_frame;
+ //FIRSTPASS_STATS *start_pos;
+
+ /*start_pos = cpi->stats_in;
+ sum_iiratio = 0.0;
+ i = 0;
+ while ( (i < 1) && vp8_input_stats(cpi,&next_frame) != EOF )
+ {
+
+ next_iiratio = next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
+ next_iiratio = ( next_iiratio < 1.0 ) ? 1.0 : (next_iiratio > 20.0) ? 20.0 : next_iiratio;
+ sum_iiratio += next_iiratio;
+ i++;
+ }
+ if ( i > 0 )
+ {
+ relative_next_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK(cpi->avg_iiratio * (double)i);
+ }
+ else
+ {
+ relative_next_iiratio = 1.0;
+ }
+ reset_fpf_position(cpi, start_pos);*/
+
+ if (this_err > av_err)
+ modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
+ else
+ modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
+
+ /*
+ relative_next_iiratio = pow(relative_next_iiratio,0.25);
+ modified_err = modified_err * relative_next_iiratio;
+ */
+
+ return modified_err;
+}
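A worked numeric sketch of the reshaping done by calculate_modified_err(), with hypothetical values: the frame error is expressed relative to the clip average and raised to a power derived from two_pass_vbrbias, so with a bias below 100 the per-frame errors are pulled toward the average before bits are distributed.

#include <math.h>
#include <stdio.h>

int main(void)
{
    double av_err   = 500.0;    /* hypothetical average ssim-weighted error */
    double this_err = 1000.0;   /* this frame's error */
    double bias_pow = 0.75;     /* e.g. two_pass_vbrbias of 75 gives exponent 0.75 */
    double modified;

    /* same shape as calculate_modified_err() */
    modified = av_err * pow(this_err / av_err, bias_pow);

    printf("raw %.0f -> modified %.1f\n", this_err, modified);   /* about 840.9 */
    return 0;
}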
+
+double vp8_simple_weight(YV12_BUFFER_CONFIG *source)
+{
+ int i, j;
+ int Total = 0;
+
+ unsigned char *src = source->y_buffer;
+ unsigned char value;
+ double sum_weights = 0.0;
+ double Weight;
+
+    // Loop through the raw Y plane, examining levels and creating a weight for the image
+ for (i = 0; i < source->y_height; i++)
+ {
+ for (j = 0; j < source->y_width; j++)
+ {
+ value = src[j];
+
+ if (value >= 64)
+ Weight = 1.0;
+ else if (value > 32)
+ Weight = (value - 32.0f) / 32.0f;
+ else
+ Weight = 0.02;
+
+ sum_weights += Weight;
+ }
+
+ src += source->y_stride;
+ }
+
+ sum_weights /= (source->y_height * source->y_width);
+
+ return sum_weights;
+}
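The per-pixel weighting rule from vp8_simple_weight() above, pulled out as a standalone helper and run over a few sample luma values; the thresholds simply mirror the ones used in the loop.

#include <stdio.h>

/* Full weight at 64 and above, a linear ramp from 33 to 63, and a small
   floor for very dark pixels. */
static double pixel_weight(unsigned char value)
{
    if (value >= 64)
        return 1.0;
    else if (value > 32)
        return (value - 32.0) / 32.0;
    else
        return 0.02;
}

int main(void)
{
    unsigned char samples[4] = { 200, 64, 48, 10 };
    int i;

    for (i = 0; i < 4; i++)
        printf("luma %3d -> weight %.2f\n", samples[i], pixel_weight(samples[i]));

    return 0;
}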
+
+// This function returns the current per frame maximum bitrate target
+int frame_max_bits(VP8_COMP *cpi)
+{
+ // Max allocation for a single frame based on the max section guidelines passed in and how many bits are left
+ int max_bits;
+
+ // For CBR we need to also consider buffer fullness.
+ // If we are running below the optimal level then we need to gradually tighten up on max_bits.
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ double buffer_fullness_ratio = (double)DOUBLE_DIVIDE_CHECK(cpi->buffer_level) / (double)cpi->oxcf.optimal_buffer_level;
+
+        // For CBR base this on the target average bits per frame plus the maximum section rate passed in by the user
+ max_bits = (int)(cpi->av_per_frame_bandwidth * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+
+ // If our buffer is below the optimum level
+ if (buffer_fullness_ratio < 1.0)
+ {
+ // The lower of max_bits / 4 or cpi->av_per_frame_bandwidth / 4.
+ int min_max_bits = ((cpi->av_per_frame_bandwidth >> 2) < (max_bits >> 2)) ? cpi->av_per_frame_bandwidth >> 2 : max_bits >> 2;
+
+ max_bits = (int)(max_bits * buffer_fullness_ratio);
+
+ if (max_bits < min_max_bits)
+                max_bits = min_max_bits; // Lowest value we will set ... which should allow the buffer to refill.
+ }
+ }
+ // VBR
+ else
+ {
+ // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
+ max_bits = (int)(((double)cpi->bits_left / (cpi->total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+ }
+
+ // Trap case where we are out of bits
+ if (max_bits < 0)
+ max_bits = 0;
+
+ return max_bits;
+}
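A worked CBR example of the cap computed by frame_max_bits(), with hypothetical settings: the per-frame maximum starts from the average per-frame bandwidth scaled by two_pass_vbrmax_section, is shrunk by the buffer-fullness ratio when the buffer is below its optimal level, and is floored at a quarter of the smaller of the two figures so the buffer can still refill.

#include <stdio.h>

int main(void)
{
    int    av_per_frame_bandwidth = 20000;    /* hypothetical bits per frame */
    double vbrmax_section         = 150.0;    /* max section rate, percent */
    double buffer_level           = 300000.0;
    double optimal_buffer_level   = 500000.0;
    double fullness;
    int    max_bits;

    max_bits = (int)(av_per_frame_bandwidth * (vbrmax_section / 100.0));   /* 30000 */
    fullness = buffer_level / optimal_buffer_level;                        /* 0.6   */

    if (fullness < 1.0)
    {
        int quarter_avg = av_per_frame_bandwidth >> 2;
        int quarter_max = max_bits >> 2;
        int floor_bits  = (quarter_avg < quarter_max) ? quarter_avg : quarter_max;

        max_bits = (int)(max_bits * fullness);                             /* 18000 */

        if (max_bits < floor_bits)
            max_bits = floor_bits;
    }

    printf("per-frame cap = %d bits\n", max_bits);
    return 0;
}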
+
+void vp8_output_stats(struct vpx_codec_pkt_list *pktlist,
+ FIRSTPASS_STATS *stats)
+{
+ struct vpx_codec_cx_pkt pkt;
+ pkt.kind = VPX_CODEC_STATS_PKT;
+ pkt.data.twopass_stats.buf = stats;
+ pkt.data.twopass_stats.sz = sizeof(*stats);
+ vpx_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#ifdef OUTPUT_FPF
+ {
+ FILE *fpfile;
+ fpfile = fopen("firstpass.stt", "a");
+
+ fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n",
+ stats->frame,
+ stats->intra_error,
+ stats->coded_error,
+ stats->ssim_weighted_pred_err,
+ stats->pcnt_inter,
+ stats->pcnt_motion,
+ stats->pcnt_second_ref,
+ stats->MVr,
+ stats->mvr_abs,
+ stats->MVc,
+ stats->mvc_abs,
+ stats->MVrv,
+ stats->MVcv,
+ stats->mv_in_out_count,
+ stats->count);
+ fclose(fpfile);
+ }
+#endif
+}
+
+int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
+{
+ if (cpi->stats_in >= cpi->stats_in_end)
+ return EOF;
+
+ *fps = *cpi->stats_in++;
+ return 1;
+}
+
+void vp8_zero_stats(FIRSTPASS_STATS *section)
+{
+ section->frame = 0.0;
+ section->intra_error = 0.0;
+ section->coded_error = 0.0;
+ section->ssim_weighted_pred_err = 0.0;
+ section->pcnt_inter = 0.0;
+ section->pcnt_motion = 0.0;
+ section->pcnt_second_ref = 0.0;
+ section->MVr = 0.0;
+ section->mvr_abs = 0.0;
+ section->MVc = 0.0;
+ section->mvc_abs = 0.0;
+ section->MVrv = 0.0;
+ section->MVcv = 0.0;
+ section->mv_in_out_count = 0.0;
+ section->count = 0.0;
+ section->duration = 1.0;
+}
+void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
+{
+ section->frame += frame->frame;
+ section->intra_error += frame->intra_error;
+ section->coded_error += frame->coded_error;
+ section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
+ section->pcnt_inter += frame->pcnt_inter;
+ section->pcnt_motion += frame->pcnt_motion;
+ section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->MVr += frame->MVr;
+ section->mvr_abs += frame->mvr_abs;
+ section->MVc += frame->MVc;
+ section->mvc_abs += frame->mvc_abs;
+ section->MVrv += frame->MVrv;
+ section->MVcv += frame->MVcv;
+ section->mv_in_out_count += frame->mv_in_out_count;
+ section->count += frame->count;
+ section->duration += frame->duration;
+}
+void vp8_avg_stats(FIRSTPASS_STATS *section)
+{
+ if (section->count < 1.0)
+ return;
+
+ section->intra_error /= section->count;
+ section->coded_error /= section->count;
+ section->ssim_weighted_pred_err /= section->count;
+ section->pcnt_inter /= section->count;
+ section->pcnt_second_ref /= section->count;
+ section->pcnt_motion /= section->count;
+ section->MVr /= section->count;
+ section->mvr_abs /= section->count;
+ section->MVc /= section->count;
+ section->mvc_abs /= section->count;
+ section->MVrv /= section->count;
+ section->MVcv /= section->count;
+ section->mv_in_out_count /= section->count;
+ section->duration /= section->count;
+}
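+
+// These helpers are used together when summarising a section of the first
+// pass file, e.g. (see the gf group analysis below):
+//     vp8_zero_stats(&sectionstats);
+//     for each frame in the section
+//         vp8_accumulate_stats(&sectionstats, &next_frame);
+//     vp8_avg_stats(&sectionstats);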
+
+int vp8_fpmm_get_pos(VP8_COMP *cpi)
+{
+ return ftell(cpi->fp_motion_mapfile);
+}
+void vp8_fpmm_reset_pos(VP8_COMP *cpi, int target_pos)
+{
+ int Offset;
+
+ if (cpi->fp_motion_mapfile)
+ {
+ Offset = ftell(cpi->fp_motion_mapfile) - target_pos;
+ fseek(cpi->fp_motion_mapfile, (int) - Offset, SEEK_CUR);
+ }
+}
+
+void vp8_advance_fpmm(VP8_COMP *cpi, int count)
+{
+#ifdef FIRSTPASS_MM
+ fseek(cpi->fp_motion_mapfile, (int)(count * cpi->common.MBs), SEEK_CUR);
+#endif
+}
+
+void vp8_input_fpmm(VP8_COMP *cpi, int count)
+{
+#ifdef FIRSTPASS_MM
+
+ unsigned char *tmp_motion_map;
+ int i, j;
+
+ if (!cpi->fp_motion_mapfile)
+ return; // Error
+
+ // Create the first pass motion map structure and set to 0
+ CHECK_MEM_ERROR(tmp_motion_map, vpx_calloc(cpi->common.MBs, 1));
+
+ // Reset the state of the global map
+ vpx_memset(cpi->fp_motion_map, 0, cpi->common.MBs);
+
+ // Read the specified number of frame maps and set the global map to the highest value seen for each mb.
+ for (i = 0; i < count; i++)
+ {
+ if (fread(tmp_motion_map, 1, cpi->common.MBs, cpi->fp_motion_mapfile) == cpi->common.MBs)
+ {
+ for (j = 0; j < cpi->common.MBs; j++)
+ {
+ if (tmp_motion_map[j] > 1)
+ cpi->fp_motion_map[j] += 5; // Intra is flagged
+ else
+ cpi->fp_motion_map[j] += tmp_motion_map[j];
+ }
+ }
+ else
+ break; // Read error
+
+ }
+
+ if (tmp_motion_map != 0)
+ vpx_free(tmp_motion_map);
+
+#endif
+
+}
+
+void vp8_init_first_pass(VP8_COMP *cpi)
+{
+ vp8_zero_stats(&cpi->total_stats);
+
+#ifdef FIRSTPASS_MM
+ cpi->fp_motion_mapfile = fopen("fpmotionmap.stt", "wb");
+#endif
+
+// TEMP debug code
+#ifdef OUTPUT_FPF
+ {
+ FILE *fpfile;
+ fpfile = fopen("firstpass.stt", "w");
+ fclose(fpfile);
+ }
+#endif
+
+}
+
+void vp8_end_first_pass(VP8_COMP *cpi)
+{
+ vp8_output_stats(cpi->output_pkt_list, &cpi->total_stats);
+
+#ifdef FIRSTPASS_MM
+
+ if (cpi->fp_motion_mapfile)
+ fclose(cpi->fp_motion_mapfile);
+
+#endif
+
+}
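+
+// Measure the error of the simple zero (0,0) motion vector for this macro
+// block: mse16x16 between the source block and the co-located block in
+// recon_buffer, written to *best_motion_err. This provides the baseline
+// that the diamond searches in vp8_first_pass_motion_search() have to beat.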
+void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
+{
+ MACROBLOCKD * const xd = & x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+
+ unsigned char *src_ptr = (*(b->base_src) + b->src);
+ int src_stride = b->src_stride;
+ unsigned char *ref_ptr;
+ int ref_stride=d->pre_stride;
+
+ // Set up pointers for this macro block recon buffer
+ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+ ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre );
+
+ VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
+}
+
+
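+// Run a reduced range diamond search (step_param starting at 3) around
+// ref_mv for this 16x16 block. A fixed penalty of 256 (new_mv_mode_penalty)
+// is added to each search result before it is compared against
+// *best_motion_err, so a new motion vector must clearly beat the 0,0 and
+// intra alternatives; when it does, *best_motion_err and *best_mv are
+// updated.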
+void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
+{
+ MACROBLOCKD *const xd = & x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ int num00;
+
+ MV tmp_mv = {0, 0};
+
+ int tmp_err;
+ int step_param = 3; //3; // Dont search over full range for first pass
+ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; //3;
+ int n;
+ vp8_variance_fn_ptr_t v_fn_ptr;
+ int new_mv_mode_penalty = 256;
+
+ v_fn_ptr.vf = VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16);
+ v_fn_ptr.sdf = cpi->fn_ptr.sdf;
+ v_fn_ptr.sdx4df = cpi->fn_ptr.sdx4df;
+
+ // Set up pointers for this macro block recon buffer
+ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+ // Initial step/diamond search centred on best mv
+ tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
+ if ( tmp_err < INT_MAX-new_mv_mode_penalty )
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err)
+ {
+ *best_motion_err = tmp_err;
+ best_mv->row = tmp_mv.row;
+ best_mv->col = tmp_mv.col;
+ }
+
+ // Further step/diamond searches as necessary
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
+ if ( tmp_err < INT_MAX-new_mv_mode_penalty )
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err)
+ {
+ *best_motion_err = tmp_err;
+ best_mv->row = tmp_mv.row;
+ best_mv->col = tmp_mv.col;
+ }
+ }
+ }
+}
+
+void vp8_first_pass(VP8_COMP *cpi)
+{
+ int mb_row, mb_col;
+ MACROBLOCK *const x = & cpi->mb;
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+
+ int col_blocks = 4 * cm->mb_cols;
+ int recon_yoffset, recon_uvoffset;
+ int recon_y_stride = cm->last_frame.y_stride;
+ int recon_uv_stride = cm->last_frame.uv_stride;
+ int intra_error = 0;
+ int coded_error = 0;
+
+ int sum_mvr = 0, sum_mvc = 0;
+ int sum_mvr_abs = 0, sum_mvc_abs = 0;
+ int sum_mvrs = 0, sum_mvcs = 0;
+ int mvcount = 0;
+ int intercount = 0;
+ int second_ref_count = 0;
+ int intrapenalty = 256;
+
+ int sum_in_vectors = 0;
+
+ MV best_ref_mv = {0, 0};
+ MV zero_ref_mv = {0, 0};
+
+ unsigned char *fp_motion_map_ptr = cpi->fp_motion_map;
+
+ vp8_clear_system_state(); //__asm emms;
+
+ x->src = * cpi->Source;
+ xd->pre = cm->last_frame;
+ xd->dst = cm->new_frame;
+
+ vp8_build_block_offsets(x);
+
+ vp8_setup_block_dptrs(&x->e_mbd);
+
+ vp8_setup_block_ptrs(x);
+
+ // set up the new frame for intra coded blocks
+ vp8_setup_intra_recon(&cm->new_frame);
+ vp8cx_frame_init_quantizer(cpi);
+
+ // Initialise the MV cost table to the defaults
+ //if( cm->current_video_frame == 0)
+ //if ( 0 )
+ {
+ int flag[2] = {1, 1};
+ vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+ vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+ }
+
+ // for each macroblock row in image
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ MV best_ref_mv = {0, 0};
+
+ // reset above block coeffs
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+ // for each macroblock col in image
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int this_error;
+ int gf_motion_error = INT_MAX;
+ int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+ xd->dst.y_buffer = cm->new_frame.y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->new_frame.u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->new_frame.v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+
+ // do intra 16x16 prediction
+ this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+
+ // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
+ // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick mostly INTRA modes and hence to throw lots of key frames.
+ // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+ this_error += intrapenalty;
+
+ // Cumulative intra error total
+ intra_error += this_error;
+
+ // Indicate default assumption of intra in the motion map
+ *fp_motion_map_ptr = 2;
+
+ // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+ x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+ // Other than for the first frame do a motion search
+ if (cm->current_video_frame > 0)
+ {
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ MV tmp_mv = {0, 0};
+ int tmp_err;
+ int motion_error = INT_MAX;
+
+ // Simple 0,0 motion with no mv overhead
+ vp8_zz_motion_search( cpi, x, &cm->last_frame, &motion_error, recon_yoffset );
+ d->bmi.mv.as_mv.row = 0;
+ d->bmi.mv.as_mv.col = 0;
+
+ // Test last reference frame using the previous best mv as the starting point (best reference) for the search
+ vp8_first_pass_motion_search(cpi, x, &best_ref_mv, &d->bmi.mv.as_mv, &cm->last_frame, &motion_error, recon_yoffset);
+
+ // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+ if ((best_ref_mv.col != 0) || (best_ref_mv.row != 0))
+ {
+ tmp_err = INT_MAX;
+ vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, &cm->last_frame, &motion_error, recon_yoffset);
+
+ if ( tmp_err < motion_error )
+ {
+ motion_error = tmp_err;
+ d->bmi.mv.as_mv.row = tmp_mv.row;
+ d->bmi.mv.as_mv.col = tmp_mv.col;
+ }
+
+ }
+
+ // Experimental search in a second reference frame ((0,0) based only)
+ if (cm->current_video_frame > 1)
+ {
+ vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, &cm->golden_frame, &gf_motion_error, recon_yoffset);
+
+ if ((gf_motion_error < motion_error) && (gf_motion_error < this_error))
+ {
+ second_ref_count++;
+ //motion_error = gf_motion_error;
+ //d->bmi.mv.as_mv.row = tmp_mv.row;
+ //d->bmi.mv.as_mv.col = tmp_mv.col;
+ }
+ /*else
+ {
+ xd->pre.y_buffer = cm->last_frame.y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cm->last_frame.u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cm->last_frame.v_buffer + recon_uvoffset;
+ }*/
+
+
+ // Reset to last frame as reference buffer
+ xd->pre.y_buffer = cm->last_frame.y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cm->last_frame.u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cm->last_frame.v_buffer + recon_uvoffset;
+ }
+
+ if (motion_error <= this_error)
+ {
+ d->bmi.mv.as_mv.row <<= 3;
+ d->bmi.mv.as_mv.col <<= 3;
+ this_error = motion_error;
+ vp8_set_mbmode_and_mvs(x, NEWMV, &d->bmi.mv.as_mv);
+ vp8_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x);
+ sum_mvr += d->bmi.mv.as_mv.row;
+ sum_mvr_abs += abs(d->bmi.mv.as_mv.row);
+ sum_mvc += d->bmi.mv.as_mv.col;
+ sum_mvc_abs += abs(d->bmi.mv.as_mv.col);
+ sum_mvrs += d->bmi.mv.as_mv.row * d->bmi.mv.as_mv.row;
+ sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col;
+ intercount++;
+
+ best_ref_mv.row = d->bmi.mv.as_mv.row;
+ best_ref_mv.col = d->bmi.mv.as_mv.col;
+ //best_ref_mv.row = 0;
+ //best_ref_mv.col = 0;
+
+ // Was the vector non-zero
+ if (d->bmi.mv.as_mv.row || d->bmi.mv.as_mv.col)
+ {
+ mvcount++;
+
+ *fp_motion_map_ptr = 1;
+
+ // Does the Row vector point inwards or outwards
+ if (mb_row < cm->mb_rows / 2)
+ {
+ if (d->bmi.mv.as_mv.row > 0)
+ sum_in_vectors--;
+ else if (d->bmi.mv.as_mv.row < 0)
+ sum_in_vectors++;
+ }
+ else if (mb_row > cm->mb_rows / 2)
+ {
+ if (d->bmi.mv.as_mv.row > 0)
+ sum_in_vectors++;
+ else if (d->bmi.mv.as_mv.row < 0)
+ sum_in_vectors--;
+ }
+
+ // Does the Column vector point inwards or outwards
+ if (mb_col < cm->mb_cols / 2)
+ {
+ if (d->bmi.mv.as_mv.col > 0)
+ sum_in_vectors--;
+ else if (d->bmi.mv.as_mv.col < 0)
+ sum_in_vectors++;
+ }
+ else if (mb_col > cm->mb_cols / 2)
+ {
+ if (d->bmi.mv.as_mv.col > 0)
+ sum_in_vectors++;
+ else if (d->bmi.mv.as_mv.col < 0)
+ sum_in_vectors--;
+ }
+ }
+ else
+ *fp_motion_map_ptr = 0; // 0,0 mv was best
+ }
+ else
+ {
+ best_ref_mv.row = 0;
+ best_ref_mv.col = 0;
+ }
+ }
+
+ coded_error += this_error;
+
+ // adjust to the next column of macroblocks
+ x->src.y_buffer += 16;
+ x->src.u_buffer += 8;
+ x->src.v_buffer += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ // Update the motion map
+ fp_motion_map_ptr++;
+ }
+
+ // adjust to the next row of mbs
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+
+ //extend the recon for intra prediction
+ vp8_extend_mb_row(&cm->new_frame, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+ vp8_clear_system_state(); //__asm emms;
+ }
+
+ vp8_clear_system_state(); //__asm emms;
+ {
+ double weight = 0.0;
+ double weigth2 = 0.0;
+
+ FIRSTPASS_STATS fps;
+
+ fps.frame = cm->current_video_frame ;
+ fps.intra_error = intra_error >> 8;
+ fps.coded_error = coded_error >> 8;
+ weight = vp8_simple_weight(cpi->Source);
+
+ if (weight < 0.1)
+ weight = 0.1;
+
+ fps.ssim_weighted_pred_err = fps.coded_error * weight;
+
+ fps.pcnt_inter = 0.0;
+ fps.pcnt_motion = 0.0;
+ fps.MVr = 0.0;
+ fps.mvr_abs = 0.0;
+ fps.MVc = 0.0;
+ fps.mvc_abs = 0.0;
+ fps.MVrv = 0.0;
+ fps.MVcv = 0.0;
+ fps.mv_in_out_count = 0.0;
+ fps.count = 1.0;
+
+ fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs;
+ fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+
+ if (mvcount > 0)
+ {
+ fps.MVr = (double)sum_mvr / (double)mvcount;
+ fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
+ fps.MVc = (double)sum_mvc / (double)mvcount;
+ fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
+ fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
+ fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+ fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
+
+ fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
+ }
+
+ // TODO: handle the case where duration is set to 0, or to something less
+ // than the full time between subsequent values of cpi->source_time_stamp.
+ fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp;
+
+ // don't want to do outputstats with a stack variable!
+ cpi->this_frame_stats = fps;
+ vp8_output_stats(cpi->output_pkt_list, &cpi->this_frame_stats);
+ vp8_accumulate_stats(&cpi->total_stats, &fps);
+
+#ifdef FIRSTPASS_MM
+ fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, cpi->fp_motion_mapfile);
+#endif
+ }
+
+ // Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met
+ if ((cm->current_video_frame > 0) &&
+ (cpi->this_frame_stats.pcnt_inter > 0.20) &&
+ ((cpi->this_frame_stats.intra_error / cpi->this_frame_stats.coded_error) > 2.0))
+ {
+ vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
+ }
+
+ // swap frame pointers so last frame refers to the frame we just compressed
+ vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);
+ vp8_yv12_extend_frame_borders(&cm->last_frame);
+
+ // Special case for the first frame. Copy into the GF buffer as a second reference.
+ if (cm->current_video_frame == 0)
+ {
+ vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
+ }
+
+
+ // use this to see what the first pass reconstruction looks like
+ if (0)
+ {
+ char filename[512];
+ FILE *recon_file;
+ sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+
+ if (cm->current_video_frame == 0)
+ recon_file = fopen(filename, "wb");
+ else
+ recon_file = fopen(filename, "ab");
+
+ fwrite(cm->last_frame.buffer_alloc, cm->last_frame.frame_size, 1, recon_file);
+ fclose(recon_file);
+ }
+
+ cm->current_video_frame++;
+
+}
+extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
+
+#define BASE_ERRPERMB 150
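+// Estimate the highest (worst) Q likely to be needed to hit the target rate
+// for a section. target_norm_bits_per_mb is the per macro block budget in
+// the same scaled units (the 512 factor, presumably to keep precision in
+// integer arithmetic) as the vp8_bits_per_mb table; the loop then walks Q
+// upwards until the projected bits per macro block, after the error and
+// speed based correction factors, fits within that budget.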
+static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
+{
+ int Q;
+ int num_mbs = ((Height * Width) / (16 * 16));
+ int target_norm_bits_per_mb;
+
+ double err_per_mb = section_err / num_mbs;
+ double correction_factor;
+ double corr_high;
+ double speed_correction = 1.0;
+ double rolling_ratio;
+
+ double pow_highq = 0.90;
+ double pow_lowq = 0.40;
+
+ if (section_target_bandwitdh <= 0)
+ return MAXQ;
+
+ target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs);
+
+ // Calculate a corrective factor based on a rolling ratio of bits spent vs target bits
+ if ((cpi->rolling_target_bits > 0.0) && (cpi->active_worst_quality < cpi->worst_quality))
+ {
+ //double adjustment_rate = 0.985 + (0.00005 * cpi->active_worst_quality);
+ double adjustment_rate = 0.99;
+
+ rolling_ratio = (double)cpi->rolling_actual_bits / (double)cpi->rolling_target_bits;
+
+ //if ( cpi->est_max_qcorrection_factor > rolling_ratio )
+ if (rolling_ratio < 0.95)
+ //cpi->est_max_qcorrection_factor *= adjustment_rate;
+ cpi->est_max_qcorrection_factor -= 0.005;
+ //else if ( cpi->est_max_qcorrection_factor < rolling_ratio )
+ else if (rolling_ratio > 1.05)
+ cpi->est_max_qcorrection_factor += 0.005;
+
+ //cpi->est_max_qcorrection_factor /= adjustment_rate;
+
+ cpi->est_max_qcorrection_factor = (cpi->est_max_qcorrection_factor < 0.1) ? 0.1 : (cpi->est_max_qcorrection_factor > 10.0) ? 10.0 : cpi->est_max_qcorrection_factor;
+ }
+
+ // Corrections for higher compression speed settings (reduced compression expected)
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ // Correction factor used for Q values >= 20
+ corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
+ corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+
+ // Try and pick a Q that should be high enough to encode the content at the given rate.
+ for (Q = 0; Q < MAXQ; Q++)
+ {
+ int bits_per_mb_at_this_q;
+
+ if (Q < 50)
+ {
+ correction_factor = pow(err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01));
+ correction_factor = (correction_factor < 0.05) ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
+ }
+ else
+ correction_factor = corr_high;
+
+ bits_per_mb_at_this_q = (int)(.5 + correction_factor * speed_correction * cpi->est_max_qcorrection_factor * cpi->section_max_qfactor * (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0);
+ //bits_per_mb_at_this_q = (int)(.5 + correction_factor * speed_correction * cpi->est_max_qcorrection_factor * (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0);
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ return Q;
+}
+static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
+{
+ int Q;
+ int num_mbs = ((Height * Width) / (16 * 16));
+ int target_norm_bits_per_mb;
+
+ double err_per_mb = section_err / num_mbs;
+ double correction_factor;
+ double corr_high;
+ double speed_correction = 1.0;
+ double pow_highq = 0.90;
+ double pow_lowq = 0.40;
+
+ target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs);
+
+ // Corrections for higher compression speed settings (reduced compression expected)
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ // Correction factor used for Q values >= 20
+ corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
+ corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+
+ // Try and pick a Q that can encode the content at the given rate.
+ for (Q = 0; Q < MAXQ; Q++)
+ {
+ int bits_per_mb_at_this_q;
+
+ if (Q < 50)
+ {
+ correction_factor = pow(err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01));
+ correction_factor = (correction_factor < 0.05) ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
+ }
+ else
+ correction_factor = corr_high;
+
+ bits_per_mb_at_this_q = (int)(.5 + correction_factor * speed_correction * cpi->est_max_qcorrection_factor * (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0);
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ return Q;
+}
+
+// Estimate a worst case Q for a KF group
+static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio)
+{
+ int Q;
+ int num_mbs = ((Height * Width) / (16 * 16));
+ int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs;
+ int bits_per_mb_at_this_q;
+
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double corr_high;
+ double speed_correction = 1.0;
+ double current_spend_ratio = 1.0;
+
+ double pow_highq = (POW1 < 0.6) ? POW1 + 0.3 : 0.90;
+ double pow_lowq = (POW1 < 0.7) ? POW1 + 0.1 : 0.80;
+
+ double iiratio_correction_factor = 1.0;
+
+ double combined_correction_factor;
+
+ // Trap special case where the target is <= 0
+ if (target_norm_bits_per_mb <= 0)
+ return MAXQ * 2;
+
+ // Calculate a corrective factor based on a rolling ratio of bits spent vs target bits
+ // This is clamped to the range 0.1 to 10.0
+ if (cpi->long_rolling_target_bits <= 0)
+ current_spend_ratio = 10.0;
+ else
+ {
+ current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits;
+ current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio;
+ }
+
+ // Calculate a correction factor based on the quality of prediction in the sequence as indicated by intra_inter error score ratio (IIRatio)
+ // The idea here is to favour subsampling in the hardest sections vs the easiest.
+ iiratio_correction_factor = 1.0 - ((group_iiratio - 6.0) * 0.1);
+
+ if (iiratio_correction_factor < 0.5)
+ iiratio_correction_factor = 0.5;
+
+ // Corrections for higher compression speed settings (reduced compression expected)
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ // Combine the various factors calculated above
+ combined_correction_factor = speed_correction * iiratio_correction_factor * current_spend_ratio;
+
+ // Correction factor used for Q values >= 20
+ corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
+ corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+
+ // Try and pick a Q that should be high enough to encode the content at the given rate.
+ for (Q = 0; Q < MAXQ; Q++)
+ {
+ // Q values < 20 treated as a special case
+ if (Q < 20)
+ {
+ err_correction_factor = pow(err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01));
+ err_correction_factor = (err_correction_factor < 0.05) ? 0.05 : (err_correction_factor > 5.0) ? 5.0 : err_correction_factor;
+ }
+ else
+ err_correction_factor = corr_high;
+
+ bits_per_mb_at_this_q = (int)(.5 + err_correction_factor * combined_correction_factor * (double)vp8_bits_per_mb[INTER_FRAME][Q]);
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ // If we could not hit the target even at Max Q then estimate what Q would have been required
+ while ((bits_per_mb_at_this_q > target_norm_bits_per_mb) && (Q < (MAXQ * 2)))
+ {
+
+ bits_per_mb_at_this_q = (int)(0.96 * bits_per_mb_at_this_q);
+ Q++;
+ }
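+ // Note: Q may now exceed MAXQ. Each extra (virtual) step above is assumed
+ // to save roughly 4% of the bits per macro block (the 0.96 factor), so
+ // values up to MAXQ * 2 simply estimate how far over budget the group is.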
+
+ if (0)
+ {
+ FILE *f = fopen("estkf_q.stt", "a");
+ fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", cpi->common.current_video_frame, bits_per_mb_at_this_q,
+ target_norm_bits_per_mb, err_per_mb, err_correction_factor,
+ current_spend_ratio, group_iiratio, iiratio_correction_factor,
+ (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, Q);
+ fclose(f);
+ }
+
+ return Q;
+}
+extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);
+
+void vp8_init_second_pass(VP8_COMP *cpi)
+{
+ FIRSTPASS_STATS this_frame;
+ FIRSTPASS_STATS *start_pos;
+
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ vp8_zero_stats(&cpi->total_stats);
+
+ if (!cpi->stats_in_end)
+ return;
+
+ cpi->total_stats = *cpi->stats_in_end;
+
+ cpi->total_error_left = cpi->total_stats.ssim_weighted_pred_err;
+ cpi->total_intra_error_left = cpi->total_stats.intra_error;
+ cpi->total_coded_error_left = cpi->total_stats.coded_error;
+ cpi->start_tot_err_left = cpi->total_error_left;
+
+ //cpi->bits_left = (long long)(cpi->total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+ //cpi->bits_left -= (long long)(cpi->total_stats.count * two_pass_min_rate / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+
+ // each frame can have a different duration, as the frame rate in the source
+ // isn't guaranteed to be constant. The frame rate prior to the first frame
+ // encoded in the second pass is a guess. However the sum duration is not.
+ // It's calculated based on the actual durations of all frames from the first
+ // pass.
+ vp8_new_frame_rate(cpi, 10000000.0 * cpi->total_stats.count / cpi->total_stats.duration);
+
+ cpi->output_frame_rate = cpi->oxcf.frame_rate;
+ cpi->bits_left = (long long)(cpi->total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
+ cpi->bits_left -= (long long)(cpi->total_stats.duration * two_pass_min_rate / 10000000.0);
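+
+ // Illustrative example (assumed numbers): the 10000000.0 scale suggests
+ // time stamps in units of 1/10,000,000 second, so 300 first pass frames
+ // with a summed duration of 100,000,000 such units give
+ // vp8_new_frame_rate() 30.0 fps, and at a 1,000,000 bps target the two
+ // pass budget starts from about 10 * 1,000,000 = 10,000,000 bits before
+ // the two_pass_min_rate share is subtracted.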
+
+ vp8_avg_stats(&cpi->total_stats);
+
+ // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
+ {
+ double sum_iiratio = 0.0;
+ double IIRatio;
+
+ start_pos = cpi->stats_in; // Note starting "file" position
+
+ while (vp8_input_stats(cpi, &this_frame) != EOF)
+ {
+ IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+ IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
+ sum_iiratio += IIRatio;
+ }
+
+ cpi->avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->total_stats.count);
+
+ // Reset file position
+ reset_fpf_position(cpi, start_pos);
+ }
+
+ // Scan the first pass file and calculate a modified total error based upon the bias/power function
+ // used to allocate bits
+ {
+ start_pos = cpi->stats_in; // Note starting "file" position
+
+ cpi->modified_total_error_left = 0.0;
+
+ while (vp8_input_stats(cpi, &this_frame) != EOF)
+ {
+ cpi->modified_total_error_left += calculate_modified_err(cpi, &this_frame);
+ }
+
+ reset_fpf_position(cpi, start_pos); // Reset file position
+
+ }
+
+#ifdef FIRSTPASS_MM
+ cpi->fp_motion_mapfile = 0;
+ cpi->fp_motion_mapfile = fopen("fpmotionmap.stt", "rb");
+#endif
+
+}
+
+void vp8_end_second_pass(VP8_COMP *cpi)
+{
+#ifdef FIRSTPASS_MM
+
+ if (cpi->fp_motion_mapfile)
+ fclose(cpi->fp_motion_mapfile);
+
+#endif
+}
+
+// Analyse and define a gf/arf group.
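+// The loop below scans forward through the first pass stats accumulating a
+// boost score for each frame from its intra / coded error ratio, damped by
+// a decay factor driven by pcnt_inter and the amount of motion, until one
+// of the breakout conditions (interval limits, weak prediction or a
+// stalling boost score) is hit. The resulting interval and boost then
+// drive the ARF decision and the gf/arf bit allocation.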
+static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS *start_pos;
+ int i;
+ int count = 0;
+ int image_size = cpi->common.last_frame.y_width * cpi->common.last_frame.y_height;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double gf_group_err = 0.0;
+ double gf_first_frame_err = 0.0;
+ double mod_frame_err = 0.0;
+
+ double mv_accumulator_rabs = 0.0;
+ double mv_accumulator_cabs = 0.0;
+ double this_mv_rabs;
+ double this_mv_cabs;
+ double mv_ratio_accumulator = 0.0;
+ double distance_factor = 0.0;
+ double decay_accumulator = 1.0;
+
+ double boost_factor = IIFACTOR;
+ double loop_decay_rate = 1.00; // Starting decay rate
+
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ double mod_err_per_mb_accumulator = 0.0;
+
+ int max_bits = frame_max_bits(cpi); // Max for a single frame
+
+#ifdef FIRSTPASS_MM
+ int fpmm_pos;
+#endif
+
+ cpi->gf_group_bits = 0;
+ cpi->gf_decay_rate = 0;
+
+ vp8_clear_system_state(); //__asm emms;
+
+#ifdef FIRSTPASS_MM
+ fpmm_pos = vp8_fpmm_get_pos(cpi);
+#endif
+
+ start_pos = cpi->stats_in;
+
+ // Preload the stats for the next frame.
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+ // Note the error of the frame at the start of the group (this will be the GF frame error if we code a normal gf)
+ gf_first_frame_err = mod_frame_err;
+
+ // Special treatment if the current frame is a key frame (which is also a gf).
+ // If it is then its error score (and hence bit allocation) needs to be subtracted out
+ // from the calculation for the GF group
+ if (cpi->common.frame_type == KEY_FRAME)
+ gf_group_err -= gf_first_frame_err;
+
+ // Scan forward to try and work out how many frames the next gf group should contain and
+ // what level of boost is appropriate for the GF or ARF that will be coded with the group
+ i = 0;
+
+ while (((i < cpi->max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key))
+ {
+ double r;
+ double motion_factor;
+ double this_frame_mvr_ratio;
+ double this_frame_mvc_ratio;
+
+ i++; // Increment the loop counter
+
+ // Accumulate error score of frames in this gf group
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+ gf_group_err += mod_frame_err;
+
+ mod_err_per_mb_accumulator += mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
+
+ if (EOF == vp8_input_stats(cpi, &next_frame))
+ break;
+
+ // Accumulate motion stats.
+ motion_factor = next_frame.pcnt_motion;
+ this_mv_rabs = fabs(next_frame.mvr_abs * motion_factor);
+ this_mv_cabs = fabs(next_frame.mvc_abs * motion_factor);
+
+ mv_accumulator_rabs += fabs(next_frame.mvr_abs * motion_factor);
+ mv_accumulator_cabs += fabs(next_frame.mvc_abs * motion_factor);
+
+ //Accumulate Motion In/Out of frame stats
+ this_frame_mv_in_out = next_frame.mv_in_out_count * next_frame.pcnt_motion;
+ mv_in_out_accumulator += next_frame.mv_in_out_count * next_frame.pcnt_motion;
+ abs_mv_in_out_accumulator += fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
+
+ // If there is a significant amount of motion
+ if (motion_factor > 0.05)
+ {
+ this_frame_mvr_ratio = fabs(next_frame.mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(next_frame.MVr));
+ this_frame_mvc_ratio = fabs(next_frame.mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(next_frame.MVc));
+
+ mv_ratio_accumulator += (this_frame_mvr_ratio < next_frame.mvr_abs) ? (this_frame_mvr_ratio * motion_factor) : next_frame.mvr_abs * motion_factor;
+ mv_ratio_accumulator += (this_frame_mvc_ratio < next_frame.mvc_abs) ? (this_frame_mvc_ratio * motion_factor) : next_frame.mvc_abs * motion_factor;
+ }
+ else
+ {
+ mv_ratio_accumulator += 0.0;
+ this_frame_mvr_ratio = 1.0;
+ this_frame_mvc_ratio = 1.0;
+ }
+
+ // Underlying boost factor is based on the inter / intra error ratio
+ r = (boost_factor * (next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)));
+
+ // Increase boost for frames where new data is coming into the frame (eg zoom out)
+ // Slightly reduce boost if there is a net balance of motion out of the frame (zoom in)
+ // The range for this_frame_mv_in_out is -1.0 to +1.0
+ if (this_frame_mv_in_out > 0.0)
+ r += r * (this_frame_mv_in_out * 2.0);
+ else
+ r += r * (this_frame_mv_in_out / 2.0); // In extreme case boost is halved
+
+ if (r > GF_RMAX)
+ r = GF_RMAX;
+
+ // Adjust loop decay rate
+ //if ( next_frame.pcnt_inter < loop_decay_rate )
+ loop_decay_rate = next_frame.pcnt_inter;
+
+ // High % motion -> somewhat higher decay rate
+ if ((1.0 - (next_frame.pcnt_motion / 10.0)) < loop_decay_rate)
+ loop_decay_rate = (1.0 - (next_frame.pcnt_motion / 10.0));
+
+ distance_factor = sqrt((this_mv_rabs * this_mv_rabs) + (this_mv_cabs * this_mv_cabs)) / 300.0;
+ distance_factor = ((distance_factor > 1.0) ? 0.0 : (1.0 - distance_factor));
+
+ if (distance_factor < loop_decay_rate)
+ loop_decay_rate = distance_factor;
+
+ // Cumulative effect of decay
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+ decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+ //decay_accumulator = ( loop_decay_rate < decay_accumulator ) ? loop_decay_rate : decay_accumulator;
+
+ boost_score += (decay_accumulator * r);
+
+ // Break out conditions.
+ if ( /* i>4 || */
+ (
+ (i > MIN_GF_INTERVAL) && // Dont break out with a very short interval
+ ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) && // Dont break out very close to a key frame
+ ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
+ ((mv_ratio_accumulator > 100.0) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ||
+ ((boost_score - old_boost_score) < 2.0)
+ )
+ )
+ )
+ {
+ boost_score = old_boost_score;
+ break;
+ }
+
+ vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+
+ old_boost_score = boost_score;
+ }
+
+ cpi->gf_decay_rate = (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;
+
+ // When using CBR apply additional buffer related upper limits
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ double max_boost;
+
+ // For cbr apply buffer related limits
+ if (cpi->drop_frames_allowed)
+ {
+ int df_buffer_level = cpi->oxcf.drop_frames_water_mark * (cpi->oxcf.optimal_buffer_level / 100);
+
+ if (cpi->buffer_level > df_buffer_level)
+ max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ else
+ max_boost = 0.0;
+ }
+ else if (cpi->buffer_level > 0)
+ {
+ max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ }
+ else
+ {
+ max_boost = 0.0;
+ }
+
+ if (boost_score > max_boost)
+ boost_score = max_boost;
+ }
+
+ cpi->gfu_boost = (int)(boost_score * 100.0) >> 4;
+
+ // Should we use the alternate reference frame?
+ if (cpi->oxcf.play_alternate &&
+ (i >= MIN_GF_INTERVAL) &&
+ (i <= (cpi->frames_to_key - MIN_GF_INTERVAL)) && // dont use ARF very near next kf
+ (((next_frame.pcnt_inter > 0.75) &&
+ ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) &&
+ //(cpi->gfu_boost>150) &&
+ (cpi->gfu_boost > 100) &&
+ //(cpi->gfu_boost>AF_THRESH2) &&
+ //((cpi->gfu_boost/i)>AF_THRESH) &&
+ //(decay_accumulator > 0.5) &&
+ (cpi->gf_decay_rate <= (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))
+ )
+ )
+ )
+ {
+ int Boost;
+ int allocation_chunks;
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int tmp_q;
+ int arf_frame_bits = 0;
+ int group_bits;
+
+ // Estimate the bits to be allocated to the group as a whole
+ if ((cpi->kf_group_bits > 0) && (cpi->kf_group_error_left > 0))
+ group_bits = (int)((double)cpi->kf_group_bits * (gf_group_err / (double)cpi->kf_group_error_left));
+ else
+ group_bits = 0;
+
+ // Boost for arf frame
+ Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+ Boost += (cpi->baseline_gf_interval * 50);
+ allocation_chunks = (i * 100) + Boost;
+
+ // Normalize Boost and allocation chunks down to prevent overflow
+ while (Boost > 1000)
+ {
+ Boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ // Calculate the number of bits to be spent on the arf based on the boost number
+ arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks));
+
+ // Estimate if there are enough bits available to make worthwhile use of an arf.
+ tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width);
+
+ // Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames.
+ if (tmp_q < cpi->worst_quality)
+ {
+ cpi->source_alt_ref_pending = TRUE;
+
+ // For alt ref frames the error score for the end frame of the group (the alt ref frame) should not contribute to the group total and hence
+ // the number of bits allocated to the group. Rather it forms part of the next group (it is the GF at the start of the next group).
+ gf_group_err -= mod_frame_err;
+
+ // Set the interval till the next gf or arf. For ARFs this is the number of frames to be coded before the future frame that is coded as an ARF.
+ // The future frame itself is part of the next group
+ cpi->baseline_gf_interval = i - 1;
+
+#ifdef FIRSTPASS_MM
+ // Read through the motion map to load up the entry for the ARF
+ {
+ int j;
+
+ // Advance to the region of interest
+ // Current default: 2 frames before to 2 frames after the ARF frame itself
+ vp8_fpmm_reset_pos(cpi, cpi->fpmm_pos);
+
+ for (j = 0; j < cpi->baseline_gf_interval - 2; j++)
+ vp8_advance_fpmm(cpi, 1);
+
+ // Read / create a motion map for the region of interest
+ vp8_input_fpmm(cpi, 5);
+ }
+#endif
+ }
+ else
+ {
+ cpi->source_alt_ref_pending = FALSE;
+ cpi->baseline_gf_interval = i;
+ }
+ }
+ else
+ {
+ cpi->source_alt_ref_pending = FALSE;
+ cpi->baseline_gf_interval = i;
+ }
+
+ // Conventional GF
+ if (!cpi->source_alt_ref_pending)
+ {
+ // Dont allow conventional gf too near the next kf
+ if ((cpi->frames_to_key - cpi->baseline_gf_interval) < MIN_GF_INTERVAL)
+ {
+ while (cpi->baseline_gf_interval < cpi->frames_to_key)
+ {
+ if (EOF == vp8_input_stats(cpi, this_frame))
+ break;
+
+ cpi->baseline_gf_interval++;
+
+ if (cpi->baseline_gf_interval < cpi->frames_to_key)
+ gf_group_err += calculate_modified_err(cpi, this_frame);
+ }
+ }
+ }
+
+ // Now decide how many bits should be allocated to the GF group as a proportion of those remaining in the kf group.
+ // The final key frame group in the clip is treated as a special case where cpi->kf_group_bits is tied to cpi->bits_left.
+ // This is also important for short clips where there may only be one key frame.
+ if (cpi->frames_to_key >= (int)(cpi->total_stats.count - cpi->common.current_video_frame))
+ {
+ cpi->kf_group_bits = (cpi->bits_left > 0) ? cpi->bits_left : 0;
+ }
+
+ // Calculate the bits to be allocated to the group as a whole
+ if ((cpi->kf_group_bits > 0) && (cpi->kf_group_error_left > 0))
+ cpi->gf_group_bits = (int)((double)cpi->kf_group_bits * (gf_group_err / (double)cpi->kf_group_error_left));
+ else
+ cpi->gf_group_bits = 0;
+
+ cpi->gf_group_bits = (cpi->gf_group_bits < 0) ? 0 : (cpi->gf_group_bits > cpi->kf_group_bits) ? cpi->kf_group_bits : cpi->gf_group_bits;
+
+ // Clip cpi->gf_group_bits based on user supplied data rate variability limit (cpi->oxcf.two_pass_vbrmax_section)
+ if (cpi->gf_group_bits > max_bits * cpi->baseline_gf_interval)
+ cpi->gf_group_bits = max_bits * cpi->baseline_gf_interval;
+
+ // Reset the file position
+ reset_fpf_position(cpi, start_pos);
+
+ // Assign bits to the arf or gf.
+ {
+ int Boost;
+ int frames_in_section;
+ int allocation_chunks;
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+
+ // For ARF frames
+ if (cpi->source_alt_ref_pending)
+ {
+ Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+ //Boost += (cpi->baseline_gf_interval * 25);
+ Boost += (cpi->baseline_gf_interval * 50);
+
+ // Set max and minimum boost and hence minimum allocation
+ if (Boost > ((cpi->baseline_gf_interval + 1) * 200))
+ Boost = ((cpi->baseline_gf_interval + 1) * 200);
+ else if (Boost < 125)
+ Boost = 125;
+
+ frames_in_section = cpi->baseline_gf_interval + 1;
+ allocation_chunks = (frames_in_section * 100) + Boost;
+ }
+ // Else for standard golden frames
+ else
+ {
+ // boost based on inter / intra ratio of subsequent frames
+ Boost = (cpi->gfu_boost * GFQ_ADJUSTMENT) / 100;
+
+ // Set max and minimum boost and hence minimum allocation
+ if (Boost > (cpi->baseline_gf_interval * 150))
+ Boost = (cpi->baseline_gf_interval * 150);
+ else if (Boost < 125)
+ Boost = 125;
+
+ frames_in_section = cpi->baseline_gf_interval;
+ allocation_chunks = (frames_in_section * 100) + (Boost - 100);
+ }
+
+ // Normalize Boost and allocation chunks down to prevent overflow
+ while (Boost > 1000)
+ {
+ Boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ // Calculate the number of bits to be spent on the gf or arf based on the boost number
+ cpi->gf_bits = (int)((double)Boost * (cpi->gf_group_bits / (double)allocation_chunks));
+
+ // If the frame that is to be boosted is simpler than the average for the gf/arf group then use an alternative calculation
+ // based on the error score of the frame itself
+ if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval)
+ {
+ double alt_gf_grp_bits;
+ int alt_gf_bits;
+
+ alt_gf_grp_bits = ((double)cpi->kf_group_bits * (mod_frame_err * (double)cpi->baseline_gf_interval) / (double)cpi->kf_group_error_left) ;
+ alt_gf_bits = (int)((double)Boost * (alt_gf_grp_bits / (double)allocation_chunks));
+
+ if (cpi->gf_bits > alt_gf_bits)
+ {
+ cpi->gf_bits = alt_gf_bits;
+ }
+ }
+ // Else if it is harder than other frames in the group make sure it at least receives an allocation in keeping with
+ // its relative error score, otherwise it may be worse off than an "un-boosted" frame
+ else
+ {
+ int alt_gf_bits = (int)((double)cpi->kf_group_bits * (mod_frame_err / (double)cpi->kf_group_error_left));
+
+ if (alt_gf_bits > cpi->gf_bits)
+ {
+ cpi->gf_bits = alt_gf_bits;
+ }
+ }
+
+ // Apply an additional limit for CBR
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ if (cpi->gf_bits > (cpi->buffer_level >> 1))
+ cpi->gf_bits = cpi->buffer_level >> 1;
+ }
+
+ // Dont allow a negative value for gf_bits
+ if (cpi->gf_bits < 0)
+ cpi->gf_bits = 0;
+
+ // Adjust KF group bits and error remaining
+ cpi->kf_group_error_left -= gf_group_err;
+ cpi->kf_group_bits -= cpi->gf_group_bits;
+
+ if (cpi->kf_group_bits < 0)
+ cpi->kf_group_bits = 0;
+
+ // Note the error score left in the remaining frames of the group.
+ // For normal GFs we want to remove the error score for the first frame of the group (except in Key frame case where this has already happened)
+ if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
+ cpi->gf_group_error_left = gf_group_err - gf_first_frame_err;
+ else
+ cpi->gf_group_error_left = gf_group_err;
+
+ cpi->gf_group_bits -= cpi->gf_bits;
+
+ if (cpi->gf_group_bits < 0)
+ cpi->gf_group_bits = 0;
+
+ // Set aside some bits for a mid gf sequence boost
+ if ((cpi->gfu_boost > 150) && (cpi->baseline_gf_interval > 5))
+ {
+ int pct_extra = (cpi->gfu_boost - 100) / 50;
+ pct_extra = (pct_extra > 10) ? 10 : pct_extra;
+
+ cpi->mid_gf_extra_bits = (cpi->gf_group_bits * pct_extra) / 100;
+ cpi->gf_group_bits -= cpi->mid_gf_extra_bits;
+ }
+ else
+ cpi->mid_gf_extra_bits = 0;
+
+ cpi->gf_bits += cpi->min_frame_bandwidth; // Add in minimum for a frame
+ }
+
+ if (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) // Normal GF and not a KF
+ {
+ cpi->per_frame_bandwidth = cpi->gf_bits; // Per frame bit target for this frame
+ }
+
+ // Adjustment to estimate_max_q based on a measure of complexity of the section
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ FIRSTPASS_STATS sectionstats;
+ double Ratio;
+
+ vp8_zero_stats(&sectionstats);
+ reset_fpf_position(cpi, start_pos);
+
+ for (i = 0 ; i < cpi->baseline_gf_interval ; i++)
+ {
+ vp8_input_stats(cpi, &next_frame);
+ vp8_accumulate_stats(&sectionstats, &next_frame);
+ }
+
+ vp8_avg_stats(&sectionstats);
+
+ if (sectionstats.pcnt_motion < .17)
+ cpi->section_is_low_motion = 1;
+ else
+ cpi->section_is_low_motion = 0;
+
+ if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
+ cpi->section_is_fast_motion = 1;
+ else
+ cpi->section_is_fast_motion = 0;
+
+ cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+
+ Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+ //if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
+ //{
+ cpi->section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
+
+ if (cpi->section_max_qfactor < 0.80)
+ cpi->section_max_qfactor = 0.80;
+
+ //}
+ //else
+ // cpi->section_max_qfactor = 1.0;
+
+ reset_fpf_position(cpi, start_pos);
+ }
+
+#ifdef FIRSTPASS_MM
+ // Reset the First pass motion map file position
+ vp8_fpmm_reset_pos(cpi, fpmm_pos);
+#endif
+}
+
+// Allocate bits to a normal frame that is neither a gf, an arf nor a key frame.
+static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ int target_frame_size;
+
+ double modified_err;
+ double err_fraction; // What portion of the remaining GF group error is used by this frame
+
+ int max_bits = frame_max_bits(cpi); // Max for a single frame
+
+ // The final few frames have special treatment
+ if (cpi->frames_till_gf_update_due >= (int)(cpi->total_stats.count - cpi->common.current_video_frame))
+ {
+ cpi->gf_group_bits = (cpi->bits_left > 0) ? cpi->bits_left : 0;
+ }
+
+ // Calculate modified prediction error used in bit allocation
+ modified_err = calculate_modified_err(cpi, this_frame);
+
+ if (cpi->gf_group_error_left > 0)
+ err_fraction = modified_err / cpi->gf_group_error_left; // What portion of the remaining GF group error is used by this frame
+ else
+ err_fraction = 0.0;
+
+ target_frame_size = (int)((double)cpi->gf_group_bits * err_fraction); // How many of those bits available for allocation should we give it?
+
+ // Clip the target size to 0 at the bottom and to max_bits (or cpi->gf_group_bits) at the top end.
+ if (target_frame_size < 0)
+ target_frame_size = 0;
+ else
+ {
+ if (target_frame_size > max_bits)
+ target_frame_size = max_bits;
+
+ if (target_frame_size > cpi->gf_group_bits)
+ target_frame_size = cpi->gf_group_bits;
+ }
+
+ cpi->gf_group_error_left -= modified_err; // Adjust error remaining
+ cpi->gf_group_bits -= target_frame_size; // Adjust bits remaining
+
+ if (cpi->gf_group_bits < 0)
+ cpi->gf_group_bits = 0;
+
+ target_frame_size += cpi->min_frame_bandwidth; // Add in the minimum number of bits that is set aside for every frame.
+
+ // Special case for the frame that lies half way between two gfs
+ if (cpi->common.frames_since_golden == cpi->baseline_gf_interval / 2)
+ target_frame_size += cpi->mid_gf_extra_bits;
+
+ cpi->per_frame_bandwidth = target_frame_size; // Per frame bit target for this frame
+}
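+
+// Illustrative example (assumed numbers): with gf_group_bits = 200000,
+// gf_group_error_left = 50.0 and a modified_err of 5.0 for this frame,
+// err_fraction = 0.1 and the frame is initially given 20000 bits, subject
+// to the max_bits and gf_group_bits clips, before min_frame_bandwidth
+// (and, for the frame half way between two gfs, mid_gf_extra_bits) is
+// added.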
+
+void vp8_second_pass(VP8_COMP *cpi)
+{
+ int tmp_q;
+ int frames_left = (int)(cpi->total_stats.count - cpi->common.current_video_frame);
+
+ FIRSTPASS_STATS this_frame;
+ FIRSTPASS_STATS this_frame_copy;
+
+ VP8_COMMON *cm = &cpi->common;
+
+ double this_frame_error;
+ double this_frame_intra_error;
+ double this_frame_coded_error;
+
+ FIRSTPASS_STATS *start_pos;
+
+ if (!cpi->stats_in)
+ {
+ return ;
+ }
+
+ vp8_clear_system_state();
+
+ if (EOF == vp8_input_stats(cpi, &this_frame))
+ return;
+
+#ifdef FIRSTPASS_MM
+ vpx_memset(cpi->fp_motion_map, 0, cpi->common.MBs);
+ cpi->fpmm_pos = vp8_fpmm_get_pos(cpi);
+ vp8_advance_fpmm(cpi, 1); // Read this frame's first pass motion map
+#endif
+
+ this_frame_error = this_frame.ssim_weighted_pred_err;
+ this_frame_intra_error = this_frame.intra_error;
+ this_frame_coded_error = this_frame.coded_error;
+
+ // Store information regarding level of motion etc for use in mode decisions.
+ cpi->motion_speed = (int)(fabs(this_frame.MVr) + fabs(this_frame.MVc));
+ cpi->motion_var = (int)(fabs(this_frame.MVrv) + fabs(this_frame.MVcv));
+ cpi->inter_lvl = (int)(this_frame.pcnt_inter * 100);
+ cpi->intra_lvl = (int)((1.0 - this_frame.pcnt_inter) * 100);
+ cpi->motion_lvl = (int)(this_frame.pcnt_motion * 100);
+
+ start_pos = cpi->stats_in;
+
+ // keyframe and section processing !
+ if (cpi->frames_to_key == 0)
+ {
+ // Define next KF group and assign bits to it
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ vp8_find_next_key_frame(cpi, &this_frame_copy);
+
+ // Special case: error_resilient_mode does not make much sense for two pass with its current meaning, but this code is designed to stop
+ // outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups.
+ // This is temporary code until we decide what should really happen in this case.
+ if (cpi->oxcf.error_resilient_mode)
+ {
+ cpi->gf_group_bits = cpi->kf_group_bits;
+ cpi->gf_group_error_left = cpi->kf_group_error_left;
+ cpi->baseline_gf_interval = cpi->frames_to_key;
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ cpi->source_alt_ref_pending = FALSE;
+ }
+
+ }
+
+ // Is this a GF / ARF (Note that a KF is always also a GF)
+ if (cpi->frames_till_gf_update_due == 0)
+ {
+ // Define next gf group and assign bits to it
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ define_gf_group(cpi, &this_frame_copy);
+
+ // If we are going to code an altref frame at the end of the group and the current frame is not a key frame....
+ // If the previous group used an arf this frame has already benefited from that arf boost and it should not be given extra bits
+ // If the previous group was NOT coded using arf we may want to apply some boost to this GF as well
+ if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))
+ {
+ // Assign a standard frame's worth of bits from those allocated to the GF group
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+
+ // If appropriate (we are switching into ARF active but it was not previously active) apply a boost for the gf at the start of the group.
+ //if ( !cpi->source_alt_ref_active && (cpi->gfu_boost > 150) )
+ if (FALSE)
+ {
+ int extra_bits;
+ int pct_extra = (cpi->gfu_boost - 100) / 50;
+
+ pct_extra = (pct_extra > 20) ? 20 : pct_extra;
+
+ extra_bits = (cpi->gf_group_bits * pct_extra) / 100;
+ cpi->gf_group_bits -= extra_bits;
+ cpi->per_frame_bandwidth += extra_bits;
+ }
+ }
+ }
+
+ // Otherwise this is an ordinary frame
+ else
+ {
+ // Special case: error_resilient_mode does not make much sense for two pass with its current meaning, but this code is designed to stop
+ // outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups.
+ // This is temporary code until we decide what should really happen in this case.
+ if (cpi->oxcf.error_resilient_mode)
+ {
+ cpi->frames_till_gf_update_due = cpi->frames_to_key;
+
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ // Assign bits from those allocated to the GF group
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+ }
+ }
+ else
+ {
+ // Assign bits from those allocated to the GF group
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+ }
+ }
+
+ // Set nominal per second bandwidth for this frame
+ cpi->target_bandwidth = cpi->per_frame_bandwidth * cpi->output_frame_rate;
+ if (cpi->target_bandwidth < 0)
+ cpi->target_bandwidth = 0;
+
+ if (cpi->common.current_video_frame == 0)
+ {
+ // guess at 2nd pass q
+ cpi->est_max_qcorrection_factor = 1.0;
+ tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);
+
+ if (tmp_q < cpi->worst_quality)
+ {
+ cpi->active_worst_quality = tmp_q;
+ cpi->ni_av_qi = tmp_q;
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->worst_quality;
+ cpi->ni_av_qi = cpi->worst_quality;
+ }
+ }
+ else
+ {
+ if (frames_left < 1)
+ frames_left = 1;
+
+ tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);
+
+ // Move active_worst_quality but in a damped way
+ if (tmp_q > cpi->active_worst_quality)
+ cpi->active_worst_quality ++;
+ else if (tmp_q < cpi->active_worst_quality)
+ cpi->active_worst_quality --;
+
+ cpi->active_worst_quality = ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4;
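+ // i.e. a 3:1 weighted average of the (already nudged) previous value and
+ // tmp_q, with +2 for rounding: for example 41 and 48 blend to
+ // (123 + 48 + 2) / 4 = 43.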
+
+ // Clamp to user set limits
+ if (cpi->active_worst_quality > cpi->worst_quality)
+ cpi->active_worst_quality = cpi->worst_quality;
+ else if (cpi->active_worst_quality < cpi->best_quality)
+ cpi->active_worst_quality = cpi->best_quality;
+
+ }
+
+ cpi->frames_to_key --;
+ cpi->total_error_left -= this_frame_error;
+ cpi->total_intra_error_left -= this_frame_intra_error;
+ cpi->total_coded_error_left -= this_frame_coded_error;
+}
+
+
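+// Decide whether this_frame is a plausible key frame candidate: it must not
+// lean heavily on the second reference, must be poorly predicted from the
+// last frame (very low pcnt_inter, or a sharp change in error scores), and,
+// if those checks pass, the loop below also requires that it would predict
+// the following frames reasonably well (boost_score over several frames)
+// before it is accepted.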
+static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame)
+{
+ BOOL is_viable_kf = FALSE;
+
+ // Does the frame satisfy the primary criteria of a key frame
+ // If so, then examine how well it predicts subsequent frames
+ if ((this_frame->pcnt_second_ref < 0.10) &&
+ (next_frame->pcnt_second_ref < 0.10) &&
+ ((this_frame->pcnt_inter < 0.05) ||
+ (
+ (this_frame->pcnt_inter < .25) &&
+ ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
+ ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
+ )
+ )
+ )
+ )
+ {
+ int i;
+ FIRSTPASS_STATS *start_pos;
+
+ FIRSTPASS_STATS local_next_frame;
+
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+ double next_iiratio;
+
+ vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+
+ // Note the starting file position so we can reset to it
+ start_pos = cpi->stats_in;
+
+ // Examine how well the key frame predicts subsequent frames
+ for (i = 0 ; i < 16; i++)
+ {
+ next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)) ;
+
+ if (next_iiratio > RMAX)
+ next_iiratio = RMAX;
+
+ // Cumulative effect of decay in prediction quality
+ if (local_next_frame.pcnt_inter > 0.85)
+ decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+ else
+ decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+
+ //decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+
+ // Keep a running total
+ boost_score += (decay_accumulator * next_iiratio);
+
+ // Test various breakout clauses
+ if ((local_next_frame.pcnt_inter < 0.05) ||
+ (next_iiratio < 1.5) ||
+ ((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 0.5) ||
+ (local_next_frame.intra_error < 200)
+ )
+ {
+ break;
+ }
+
+ old_boost_score = boost_score;
+
+ // Get the next frame details
+ if (EOF == vp8_input_stats(cpi, &local_next_frame))
+ break;
+ }
+
+ // If there is tolerable prediction for at least the next 3 frames then accept this as a viable key frame, else discard this potential key frame and move on
+ if (boost_score > 5.0 && (i > 3))
+ is_viable_kf = TRUE;
+ else
+ {
+ // Reset the file position
+ reset_fpf_position(cpi, start_pos);
+
+ is_viable_kf = FALSE;
+ }
+ }
+
+ return is_viable_kf;
+}
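+
+// Define the next key frame group: scan forward until a viable key frame
+// candidate (or the 2x key_frame_frequency limit) is found, work out the
+// bits to give the group from the remaining budget and its share of the
+// modified error, and then size the key frame itself according to how well
+// it predicts the frames that follow it.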
+void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ int i;
+ FIRSTPASS_STATS last_frame;
+ FIRSTPASS_STATS first_frame;
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS *start_position;
+
+ double decay_accumulator = 0;
+ double boost_score = 0;
+ double old_boost_score = 0.0;
+ double loop_decay_rate;
+
+ double kf_mod_err = 0.0;
+ double kf_group_err = 0.0;
+ double kf_group_intra_err = 0.0;
+ double kf_group_coded_err = 0.0;
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ vp8_clear_system_state(); //__asm emms;
+ start_position = cpi->stats_in;
+
+ cpi->common.frame_type = KEY_FRAME;
+
+ // Clear the alt ref active flag as this can never be active on a key frame
+ cpi->source_alt_ref_active = FALSE;
+
+ // Kf is always a gf so clear frames till next gf counter
+ cpi->frames_till_gf_update_due = 0;
+
+ cpi->frames_to_key = 1;
+
+ // Take a copy of the initial frame details
+ vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+
+ cpi->kf_group_bits = 0; // Estimate of total bits available to kf group
+ cpi->kf_group_error_left = 0; // Group modified error score.
+
+ kf_mod_err = calculate_modified_err(cpi, this_frame);
+
+ // find the next keyframe
+ while (cpi->stats_in < cpi->stats_in_end)
+ {
+ // Accumulate kf group error
+ kf_group_err += calculate_modified_err(cpi, this_frame);
+
+ // These figures keep intra and coded error counts for all frames including key frames in the group.
+ // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+ kf_group_intra_err += this_frame->intra_error;
+ kf_group_coded_err += this_frame->coded_error;
+
+ vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+
+ // Provided that we are not at the end of the file...
+ if (EOF != vp8_input_stats(cpi, this_frame))
+ {
+ if (lookup_next_frame_stats(cpi, &next_frame) != EOF)
+ {
+ if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
+ break;
+ }
+ }
+
+ // Step on to the next frame
+ cpi->frames_to_key ++;
+
+        // If we don't have a real key frame within the next two
+        // key frame frequency intervals then break out of the loop.
+ if (cpi->frames_to_key >= 2 *(int)cpi->key_frame_frequency)
+ break;
+
+ }
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already breakout of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural
+ // interval is between 1x and 2x
+ if ( cpi->frames_to_key > (int)cpi->key_frame_frequency )
+ {
+ cpi->frames_to_key /= 2;
+
+ // Estimate corrected kf group error
+ kf_group_err /= 2.0;
+ kf_group_intra_err /= 2.0;
+ kf_group_coded_err /= 2.0;
+ }
+
+ // Special case for the last frame of the file
+ if (cpi->stats_in >= cpi->stats_in_end)
+ {
+ // Accumulate kf group error
+ kf_group_err += calculate_modified_err(cpi, this_frame);
+
+ // These figures keep intra and coded error counts for all frames including key frames in the group.
+ // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+ kf_group_intra_err += this_frame->intra_error;
+ kf_group_coded_err += this_frame->coded_error;
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if ((cpi->bits_left > 0) && ((int)cpi->modified_total_error_left > 0))
+ {
+ int max_bits = frame_max_bits(cpi); // Max for a single normal frame (not key frame)
+
+ // Default allocation based on bits left and relative complexity of the section
+ cpi->kf_group_bits = (int)(cpi->bits_left * (kf_group_err / cpi->modified_total_error_left));
+
+ // Clip based on maximum per frame rate defined by the user.
+ if (cpi->kf_group_bits > max_bits * cpi->frames_to_key)
+ cpi->kf_group_bits = max_bits * cpi->frames_to_key;
+
+ // Additional special case for CBR if buffer is getting full.
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ // If the buffer is near or above the optimal and this kf group is not being allocated much
+ // then increase the allocation a bit.
+ if (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level)
+ {
+ int high_water_mark = (cpi->oxcf.optimal_buffer_level + cpi->oxcf.maximum_buffer_size) >> 1;
+ int min_group_bits;
+
+                // We are at or above the high water mark (midway between optimal and maximum).
+ if (cpi->buffer_level >= high_water_mark)
+ {
+ min_group_bits = (cpi->av_per_frame_bandwidth * cpi->frames_to_key) + (cpi->buffer_level - high_water_mark);
+
+ if (cpi->kf_group_bits < min_group_bits)
+ cpi->kf_group_bits = min_group_bits;
+ }
+                // We are above optimal but below the high water mark
+ else if (cpi->kf_group_bits < (cpi->av_per_frame_bandwidth * cpi->frames_to_key))
+ {
+ int bits_below_av = (cpi->av_per_frame_bandwidth * cpi->frames_to_key) - cpi->kf_group_bits;
+ cpi->kf_group_bits += (int)((double)bits_below_av * (double)(cpi->buffer_level - cpi->oxcf.optimal_buffer_level) /
+ (double)(high_water_mark - cpi->oxcf.optimal_buffer_level));
+ }
+ }
+ }
+ }
+ else
+ cpi->kf_group_bits = 0;
+
+ // Reset the first pass file position
+ reset_fpf_position(cpi, start_position);
+
+ // determine how big to make this keyframe based on how well the subsequent frames use inter blocks
+ decay_accumulator = 1.0;
+ boost_score = 0.0;
+ loop_decay_rate = 1.00; // Starting decay rate
+
+ for (i = 0 ; i < cpi->frames_to_key ; i++)
+ {
+ double r;
+
+ if (EOF == vp8_input_stats(cpi, &next_frame))
+ break;
+
+ r = (IIKFACTOR2 * next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)) ;
+
+ if (r > RMAX)
+ r = RMAX;
+
+ // Adjust loop decay rate
+ //if ( next_frame.pcnt_inter < loop_decay_rate )
+ loop_decay_rate = next_frame.pcnt_inter;
+
+ if ((1.0 - (next_frame.pcnt_motion / 10.0)) < loop_decay_rate)
+ loop_decay_rate = (1.0 - (next_frame.pcnt_motion / 10.0));
+
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+
+ boost_score += (decay_accumulator * r);
+
+ if ((i > MIN_GF_INTERVAL) &&
+ ((boost_score - old_boost_score) < 1.0))
+ {
+ break;
+ }
+
+ old_boost_score = boost_score;
+ }
+
+ if (1)
+ {
+ FIRSTPASS_STATS sectionstats;
+ double Ratio;
+
+ vp8_zero_stats(&sectionstats);
+ reset_fpf_position(cpi, start_position);
+
+ for (i = 0 ; i < cpi->frames_to_key ; i++)
+ {
+ vp8_input_stats(cpi, &next_frame);
+ vp8_accumulate_stats(&sectionstats, &next_frame);
+ }
+
+ vp8_avg_stats(&sectionstats);
+
+ if (sectionstats.pcnt_motion < .17)
+ cpi->section_is_low_motion = 1;
+ else
+ cpi->section_is_low_motion = 0;
+
+ if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
+ cpi->section_is_fast_motion = 1;
+ else
+ cpi->section_is_fast_motion = 0;
+
+ cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+
+ Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+ // if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
+ //{
+ cpi->section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
+
+ if (cpi->section_max_qfactor < 0.80)
+ cpi->section_max_qfactor = 0.80;
+
+ //}
+ //else
+ // cpi->section_max_qfactor = 1.0;
+ }
+
+ // When using CBR apply additional buffer fullness related upper limits
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ double max_boost;
+
+ if (cpi->drop_frames_allowed)
+ {
+ int df_buffer_level = cpi->oxcf.drop_frames_water_mark * (cpi->oxcf.optimal_buffer_level / 100);
+
+ if (cpi->buffer_level > df_buffer_level)
+ max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ else
+ max_boost = 0.0;
+ }
+ else if (cpi->buffer_level > 0)
+ {
+ max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ }
+ else
+ {
+ max_boost = 0.0;
+ }
+
+ if (boost_score > max_boost)
+ boost_score = max_boost;
+ }
+
+ // Reset the first pass file position
+ reset_fpf_position(cpi, start_position);
+
+ // Work out how many bits to allocate for the key frame itself
+ if (1)
+ {
+ int kf_boost = boost_score;
+ int allocation_chunks;
+ int Counter = cpi->frames_to_key;
+ int alt_kf_bits;
+
+ // Min boost based on kf interval
+#if 0
+
+ while ((kf_boost < 48) && (Counter > 0))
+ {
+ Counter -= 2;
+ kf_boost ++;
+ }
+
+#endif
+
+ if (kf_boost < 48)
+ {
+ kf_boost += ((Counter + 1) >> 1);
+
+ if (kf_boost > 48) kf_boost = 48;
+ }
+
+ // bigger frame sizes need larger kf boosts, smaller frames smaller boosts...
+ if ((cpi->common.last_frame.y_width * cpi->common.last_frame.y_height) > (320 * 240))
+ kf_boost += 2 * (cpi->common.last_frame.y_width * cpi->common.last_frame.y_height) / (320 * 240);
+ else if ((cpi->common.last_frame.y_width * cpi->common.last_frame.y_height) < (320 * 240))
+ kf_boost -= 4 * (320 * 240) / (cpi->common.last_frame.y_width * cpi->common.last_frame.y_height);
+
+ kf_boost = (int)((double)kf_boost * 100.0) >> 4; // Scale 16 to 100
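+        // i.e. kf_boost * 100 / 16 (approximately x6.25), converting the boost from 1/16 units
+        // onto the per-100 scale used for allocation_chunks below.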
+
+ // Adjustment to boost based on recent average q
+ kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;
+
+ if (kf_boost < 250) // Min KF boost
+ kf_boost = 250;
+
+        // We do three calculations for kf size.
+        // The first is based on the error score for the whole kf group.
+        // The second (optionally) on the key frame's own error if this is smaller than the average for the group.
+        // The final one ensures that the frame receives at least the allocation it would have received based on its own error score vs the error score remaining.
+
+ allocation_chunks = ((cpi->frames_to_key - 1) * 100) + kf_boost; // cpi->frames_to_key-1 because key frame itself is taken care of by kf_boost
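+        // For example (illustrative numbers only): with frames_to_key == 50 and kf_boost == 300,
+        // allocation_chunks == 49 * 100 + 300 == 5200, so the key frame is targeted at 300/5200 of
+        // cpi->kf_group_bits while each normal frame notionally gets 100/5200.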
+
+        // Normalize kf_boost and allocation_chunks down to prevent overflow
+ while (kf_boost > 1000)
+ {
+ kf_boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ cpi->kf_group_bits = (cpi->kf_group_bits < 0) ? 0 : cpi->kf_group_bits;
+
+ // Calculate the number of bits to be spent on the key frame
+ cpi->kf_bits = (int)((double)kf_boost * ((double)cpi->kf_group_bits / (double)allocation_chunks));
+
+ // Apply an additional limit for CBR
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ if (cpi->kf_bits > ((3 * cpi->buffer_level) >> 2))
+ cpi->kf_bits = (3 * cpi->buffer_level) >> 2;
+ }
+
+        // If the key frame is actually easier than the average for the kf group (which does sometimes happen, e.g. a blank intro frame)
+        // then use an alternate calculation based on the kf error score which should give a smaller key frame.
+ if (kf_mod_err < kf_group_err / cpi->frames_to_key)
+ {
+ double alt_kf_grp_bits = ((double)cpi->bits_left * (kf_mod_err * (double)cpi->frames_to_key) / cpi->modified_total_error_left) ;
+
+ alt_kf_bits = (int)((double)kf_boost * (alt_kf_grp_bits / (double)allocation_chunks));
+
+ if (cpi->kf_bits > alt_kf_bits)
+ {
+ cpi->kf_bits = alt_kf_bits;
+ }
+ }
+        // Else, if it is much harder than other frames in the group, make sure it at least receives an allocation in keeping with its relative error score
+ else
+ {
+ alt_kf_bits = (int)((double)cpi->bits_left * (kf_mod_err / cpi->modified_total_error_left));
+
+ if (alt_kf_bits > cpi->kf_bits)
+ {
+ cpi->kf_bits = alt_kf_bits;
+ }
+ }
+
+ cpi->kf_group_bits -= cpi->kf_bits;
+ cpi->kf_bits += cpi->min_frame_bandwidth; // Add in the minimum frame allowance
+
+        cpi->per_frame_bandwidth = cpi->kf_bits;                                          // Per frame bit target for this frame
+ cpi->target_bandwidth = cpi->kf_bits * cpi->output_frame_rate; // Convert to a per second bitrate
+ }
+
+ // Note the total error score of the kf group minus the key frame itself
+ cpi->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame sizes
+ cpi->modified_total_error_left -= kf_group_err;
+
+ if (cpi->oxcf.allow_spatial_resampling)
+ {
+ int resample_trigger = FALSE;
+ int last_kf_resampled = FALSE;
+ int kf_q;
+ int scale_val = 0;
+ int hr, hs, vr, vs;
+ int new_width = cpi->oxcf.Width;
+ int new_height = cpi->oxcf.Height;
+
+ int projected_buffer_level = cpi->buffer_level;
+ int tmp_q;
+
+ double projected_bits_perframe;
+ double group_iiratio = (kf_group_intra_err - first_frame.intra_error) / (kf_group_coded_err - first_frame.coded_error);
+ double err_per_frame = kf_group_err / cpi->frames_to_key;
+ double bits_per_frame;
+ double av_bits_per_frame;
+ double effective_size_ratio;
+
+ if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height))
+ last_kf_resampled = TRUE;
+
+        // Set back to unscaled by default
+ cpi->common.horiz_scale = NORMAL;
+ cpi->common.vert_scale = NORMAL;
+
+ // Calculate Average bits per frame.
+ //av_bits_per_frame = cpi->bits_left/(double)(cpi->total_stats.count - cpi->common.current_video_frame);
+ av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate);
+ //if ( av_bits_per_frame < 0.0 )
+ // av_bits_per_frame = 0.0
+
+ // CBR... Use the clip average as the target for deciding resample
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ bits_per_frame = av_bits_per_frame;
+ }
+
+        // In VBR we want to avoid downsampling in easy sections unless we are under extreme pressure,
+        // so use the larger of the target bitrate for this section or the average bitrate for the sequence
+ else
+ {
+ bits_per_frame = cpi->kf_group_bits / cpi->frames_to_key; // This accounts for how hard the section is...
+
+            if (bits_per_frame < av_bits_per_frame)                      // Don't turn to resampling in easy sections just because they have been assigned a small number of bits
+ bits_per_frame = av_bits_per_frame;
+ }
+
+ // bits_per_frame should comply with our minimum
+ if (bits_per_frame < (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100))
+ bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ // Work out if spatial resampling is necessary
+ kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio);
+
+ // If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section
+ projected_bits_perframe = bits_per_frame;
+ tmp_q = kf_q;
+
+ while (tmp_q > cpi->worst_quality)
+ {
+ projected_bits_perframe *= 1.04;
+ tmp_q--;
+ }
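+        // (The 1.04 factor above assumes roughly 4% more bits per frame for each Q step by which the
+        // projected Q exceeds the worst allowed quality.)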
+
+ // Guess at buffer level at the end of the section
+ projected_buffer_level = cpi->buffer_level - (int)((projected_bits_perframe - av_bits_per_frame) * cpi->frames_to_key);
+
+ if (0)
+ {
+ FILE *f = fopen("Subsamle.stt", "a");
+ fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->frames_to_key, cpi->kf_group_bits / cpi->frames_to_key, new_height, new_width);
+ fclose(f);
+ }
+
+ // The trigger for spatial resampling depends on the various parameters such as whether we are streaming (CBR) or VBR.
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ // Trigger resample if we are projected to fall below down sample level or
+ // resampled last time and are projected to remain below the up sample level
+ if ((projected_buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) ||
+ (last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))))
+ //( ((cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))) &&
+ // ((projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))) ))
+ resample_trigger = TRUE;
+ else
+ resample_trigger = FALSE;
+ }
+ else
+ {
+ long long clip_bits = (long long)(cpi->total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+ long long over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
+ long long over_spend2 = cpi->oxcf.starting_buffer_level - projected_buffer_level;
+
+ if ((last_kf_resampled && (kf_q > cpi->worst_quality)) || // If triggered last time the threshold for triggering again is reduced
+ ((kf_q > cpi->worst_quality) && // Projected Q higher than allowed and ...
+ (over_spend > clip_bits / 20))) // ... Overspend > 5% of total bits
+ resample_trigger = TRUE;
+ else
+ resample_trigger = FALSE;
+
+ }
+
+ if (resample_trigger)
+ {
+ while ((kf_q >= cpi->worst_quality) && (scale_val < 6))
+ {
+ scale_val ++;
+
+ cpi->common.vert_scale = vscale_lookup[scale_val];
+ cpi->common.horiz_scale = hscale_lookup[scale_val];
+
+ Scale2Ratio(cpi->common.horiz_scale, &hr, &hs);
+ Scale2Ratio(cpi->common.vert_scale, &vr, &vs);
+
+ new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
+ new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
+
+                // Reducing the area to 1/4 does not reduce the complexity (err_per_frame) to 1/4...
+                // effective_size_ratio attempts to provide a crude correction for this
+ effective_size_ratio = (double)(new_width * new_height) / (double)(cpi->oxcf.Width * cpi->oxcf.Height);
+ effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;
+
+ // Now try again and see what Q we get with the smaller image size
+ kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio);
+
+ if (0)
+ {
+ FILE *f = fopen("Subsamle.stt", "a");
+ fprintf(f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->frames_to_key, cpi->kf_group_bits / cpi->frames_to_key, new_height, new_width);
+ fclose(f);
+ }
+ }
+ }
+
+ if ((cpi->common.Width != new_width) || (cpi->common.Height != new_height))
+ {
+ cpi->common.Width = new_width;
+ cpi->common.Height = new_height;
+ vp8_alloc_compressor_data(cpi);
+ }
+ }
+}
diff --git a/vp8/encoder/firstpass.h b/vp8/encoder/firstpass.h
new file mode 100644
index 000000000..d7b52f3f3
--- /dev/null
+++ b/vp8/encoder/firstpass.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#if !defined __INC_FIRSTPASS_H
+#define __INC_FIRSTPASS_H
+
+extern void vp8_init_first_pass(VP8_COMP *cpi);
+extern void vp8_first_pass(VP8_COMP *cpi);
+extern void vp8_end_first_pass(VP8_COMP *cpi);
+
+extern void vp8_init_second_pass(VP8_COMP *cpi);
+extern void vp8_second_pass(VP8_COMP *cpi);
+extern void vp8_end_second_pass(VP8_COMP *cpi);
+
+#endif
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
new file mode 100644
index 000000000..52aab6642
--- /dev/null
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "variance.h"
+#include "onyx_int.h"
+
+
+void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
+
+
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+
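+// Fill the encoder RTCD table with the generic C implementations. Architecture specific
+// initialisation (e.g. vp8_arch_x86_encoder_init below) may then override individual entries.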
+void vp8_cmachine_specific_config(VP8_COMP *cpi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ cpi->rtcd.common = &cpi->common.rtcd;
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c;
+
+ cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_c;
+ cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_c;
+ cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_c;
+ cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c;
+ cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c;
+
+ cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c;
+ cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c;
+ cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c;
+ cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_c;
+ cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_c;
+
+ cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;
+
+ cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;
+ cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_c;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
+    cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;
+ cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;
+
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
+
+ cpi->rtcd.encodemb.berr = vp8_block_error_c;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_c;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_c;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;
+
+ cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
+
+ cpi->rtcd.search.full_search = vp8_full_search_sad;
+ cpi->rtcd.search.diamond_search = vp8_diamond_search_sad;
+#endif
+
+ // Pure C:
+ vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
+
+
+#if ARCH_X86 || ARCH_X86_64
+ vp8_arch_x86_encoder_init(cpi);
+#endif
+
+}
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
new file mode 100644
index 000000000..d80059d37
--- /dev/null
+++ b/vp8/encoder/mcomp.c
@@ -0,0 +1,1467 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "mcomp.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+static int mv_ref_ct [31] [4] [2];
+static int mv_mode_cts [4] [2];
+#endif
+
+static int mv_bits_sadcost[256];
+
+void vp8cx_init_mv_bits_sadcost()
+{
+ int i;
+
+ for (i = 0; i < 256; i++)
+ {
+ mv_bits_sadcost[i] = (int)sqrt(i * 16);
+ }
+}
+
+
+int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight)
+{
+    // MV costing is based on the distribution of vectors in the previous frame and as such will tend to
+    // overstate the cost of vectors. In addition, coding a new vector can have a knock-on effect on the
+    // cost of subsequent vectors and the quality of prediction from NEAR and NEAREST for subsequent blocks.
+    // The "Weight" parameter allows, to a limited extent, for some account to be taken of these factors.
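+    // The result is scaled by Weight / 128 (the >> 7), so Weight == 128 returns the raw component cost sum.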
+ return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * Weight) >> 7;
+}
+
+int vp8_mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
+{
+ //int i;
+ //return ((mvcost[0][(mv->row - ref->row)>>1] + mvcost[1][(mv->col - ref->col)>>1] + 128) * error_per_bit) >> 8;
+ //return ( (vp8_mv_bit_cost(mv, ref, mvcost, 100) + 128) * error_per_bit) >> 8;
+
+ //i = (vp8_mv_bit_cost(mv, ref, mvcost, 100) * error_per_bit + 128) >> 8;
+ return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * error_per_bit + 128) >> 8;
+ //return (vp8_mv_bit_cost(mv, ref, mvcost, 128) * error_per_bit + 128) >> 8;
+}
+
+
+static int mv_bits(MV *mv, MV *ref, int *mvcost[2])
+{
+ // get the estimated number of bits for a motion vector, to be used for costing in SAD based
+ // motion estimation
+ return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col)>> 1]) + 128) >> 8;
+}
+
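+// Build the site list for the 4-point diamond search: after the (0,0) centre entry, each step adds the
+// four axial offsets (0,-Len), (0,+Len), (-Len,0) and (+Len,0), with Len halving from MAX_FIRST_STEP down to 1.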
+void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride)
+{
+ int Len;
+ int search_site_count = 0;
+
+
+ // Generate offsets for 4 search sites per step.
+ Len = MAX_FIRST_STEP;
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = 0;
+ search_site_count++;
+
+ while (Len > 0)
+ {
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = -Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = Len;
+ search_site_count++;
+
+ // Contract.
+ Len /= 2;
+ }
+
+ x->ss_count = search_site_count;
+ x->searches_per_step = 4;
+}
+
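+// As above, but for the 8-point search: each step adds the four axial offsets plus the four diagonal
+// offsets (+/-Len, +/-Len), again halving Len each step.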
+void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
+{
+ int Len;
+ int search_site_count = 0;
+
+ // Generate offsets for 8 search sites per step.
+ Len = MAX_FIRST_STEP;
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = 0;
+ search_site_count++;
+
+ while (Len > 0)
+ {
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = -Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride - Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride + Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride - Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride + Len;
+ search_site_count++;
+
+
+ // Contract.
+ Len /= 2;
+ }
+
+ x->ss_count = search_site_count;
+ x->searches_per_step = 8;
+}
+
+
+#define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motion vector
+#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
+#define DIST(r,c) svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns the subpixel variance error at (r,c)
+#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
+#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+
+//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
+
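+// Iterative subpixel refinement: CHECK_BETTER evaluates ERR(r,c) = MVC(r,c) + DIST(r,c), i.e. the mv
+// cost plus the subpixel variance at quarter-pel position (r,c), provided (r,c) lies inside
+// [minr,maxr] x [minc,maxc], and records the best (br,bc) found so far.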
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+ unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+ unsigned char *z = (*(b->base_src) + b->src);
+
+ int rr = ref_mv->row >> 1, rc = ref_mv->col >> 1;
+ int br = bestmv->row << 2, bc = bestmv->col << 2;
+ int tr = br, tc = bc;
+ unsigned int besterr = INT_MAX;
+ unsigned int left, right, up, down, diag;
+ unsigned int sse;
+ unsigned int whichdir;
+ unsigned int halfiters = 4;
+ unsigned int quarteriters = 4;
+
+ int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1));
+ int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1));
+ int minr = MAX(x->mv_row_min << 2, (ref_mv->row >> 1) - ((1 << mvlong_width) - 1));
+ int maxr = MIN(x->mv_row_max << 2, (ref_mv->row >> 1) + ((1 << mvlong_width) - 1));
+
+ // central mv
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+
+ // calculate central point error
+ besterr = vf(y, d->pre_stride, z, b->src_stride, &sse);
+ besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    // TODO: Each subsequent iteration checks at least one point in common with the last iteration (could be 2 if diag selected)
+ while (--halfiters)
+ {
+ // 1/2 pel
+ CHECK_BETTER(left, tr, tc - 2);
+ CHECK_BETTER(right, tr, tc + 2);
+ CHECK_BETTER(up, tr - 2, tc);
+ CHECK_BETTER(down, tr + 2, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
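+        // whichdir picks the quadrant of the two better axial results: bit 0 set means right scored no
+        // worse than left, bit 1 set means down scored no worse than up; only that diagonal is checked.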
+
+ switch (whichdir)
+ {
+ case 0:
+ CHECK_BETTER(diag, tr - 2, tc - 2);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - 2, tc + 2);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + 2, tc - 2);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + 2, tc + 2);
+ break;
+ }
+
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+    // TODO: Each subsequent iteration checks at least one point in common with the last iteration (could be 2 if diag selected)
+ // 1/4 pel
+ while (--quarteriters)
+ {
+ CHECK_BETTER(left, tr, tc - 1);
+ CHECK_BETTER(right, tr, tc + 1);
+ CHECK_BETTER(up, tr - 1, tc);
+ CHECK_BETTER(down, tr + 1, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir)
+ {
+ case 0:
+ CHECK_BETTER(diag, tr - 1, tc - 1);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - 1, tc + 1);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + 1, tc - 1);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + 1, tc + 1);
+ break;
+ }
+
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ bestmv->row = br << 1;
+ bestmv->col = bc << 1;
+
+ if ((abs(bestmv->col - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs(bestmv->row - ref_mv->row) > MAX_FULL_PEL_VAL))
+ return INT_MAX;
+
+ return besterr;
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef ERR
+#undef CHECK_BETTER
+#undef MIN
+#undef MAX
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+ int bestmse = INT_MAX;
+ MV startmv;
+ //MV this_mv;
+ MV this_mv;
+ unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+ unsigned char *z = (*(b->base_src) + b->src);
+ int left, right, up, down, diag;
+ unsigned int sse;
+ int whichdir ;
+
+
+ // Trap uncodable vectors
+ if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
+ {
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+ return INT_MAX;
+ }
+
+ // central mv
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+ startmv = *bestmv;
+
+ // calculate central point error
+ bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+ bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+ // go left then right and check error
+ this_mv.row = startmv.row;
+ this_mv.col = ((startmv.col - 8) | 4);
+ left = svf(y - 1, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+ left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ }
+
+ this_mv.col += 8;
+ right = svf(y, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+ right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ }
+
+ // go up then down and check error
+ this_mv.col = startmv.col;
+ this_mv.row = ((startmv.row - 8) | 4);
+ up = svf(y - d->pre_stride, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+ up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ }
+
+ this_mv.row += 8;
+ down = svf(y, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+ down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ }
+
+
+ // now check 1 more diagonal
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+    // whichdir must be 0-3. Therefore, one of the cases below
+ // must run through. However, because there is no default
+ // and diag is not set elsewhere, we get a compile warning
+ diag = 0;
+ //for(whichdir =0;whichdir<4;whichdir++)
+ //{
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row = (this_mv.row - 8) | 4;
+ diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ case 1:
+ this_mv.col += 4;
+ this_mv.row = (this_mv.row - 8) | 4;
+ diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ case 2:
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row += 4;
+ diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ case 3:
+ this_mv.col += 4;
+ this_mv.row += 4;
+ diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+// }
+
+
+ // time to check quarter pels.
+ if (bestmv->row < startmv.row)
+ y -= d->pre_stride;
+
+ if (bestmv->col < startmv.col)
+ y--;
+
+ startmv = *bestmv;
+
+
+
+ // go left then right and check error
+ this_mv.row = startmv.row;
+
+ if (startmv.col & 7)
+ {
+ this_mv.col = startmv.col - 2;
+ left = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.col = (startmv.col - 8) | 6;
+ left = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+
+ left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ }
+
+ this_mv.col += 4;
+ right = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ }
+
+ // go up then down and check error
+ this_mv.col = startmv.col;
+
+ if (startmv.row & 7)
+ {
+ this_mv.row = startmv.row - 2;
+ up = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.row = (startmv.row - 8) | 6;
+ up = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+
+ up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ }
+
+ this_mv.row += 4;
+ down = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ }
+
+
+ // now check 1 more diagonal
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+// for(whichdir=0;whichdir<4;whichdir++)
+// {
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+
+ if (startmv.row & 7)
+ {
+ this_mv.row -= 2;
+
+ if (startmv.col & 7)
+ {
+ this_mv.col -= 2;
+ diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.col = (startmv.col - 8) | 6;
+                    diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ }
+ else
+ {
+ this_mv.row = (startmv.row - 8) | 6;
+
+ if (startmv.col & 7)
+ {
+ this_mv.col -= 2;
+ diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.col = (startmv.col - 8) | 6;
+ diag = svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
+ }
+ }
+
+ break;
+ case 1:
+ this_mv.col += 2;
+
+ if (startmv.row & 7)
+ {
+ this_mv.row -= 2;
+ diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.row = (startmv.row - 8) | 6;
+ diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+
+ break;
+ case 2:
+ this_mv.row += 2;
+
+ if (startmv.col & 7)
+ {
+ this_mv.col -= 2;
+ diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.col = (startmv.col - 8) | 6;
+            diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+ }
+
+ break;
+ case 3:
+ this_mv.col += 2;
+ this_mv.row += 2;
+ diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+// }
+
+ return bestmse;
+}
+
+int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+ int bestmse = INT_MAX;
+ MV startmv;
+ //MV this_mv;
+ MV this_mv;
+ unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+ unsigned char *z = (*(b->base_src) + b->src);
+ int left, right, up, down, diag;
+ unsigned int sse;
+
+ // Trap uncodable vectors
+ if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
+ {
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+ return INT_MAX;
+ }
+
+ // central mv
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+ startmv = *bestmv;
+
+ // calculate central point error
+ bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+ bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+ // go left then right and check error
+ this_mv.row = startmv.row;
+ this_mv.col = ((startmv.col - 8) | 4);
+ left = svf(y - 1, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+ left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ }
+
+ this_mv.col += 8;
+ right = svf(y, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+ right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ }
+
+ // go up then down and check error
+ this_mv.col = startmv.col;
+ this_mv.row = ((startmv.row - 8) | 4);
+ up = svf(y - d->pre_stride, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+ up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ }
+
+ this_mv.row += 8;
+ down = svf(y, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+ down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ }
+
+    // Somewhat strangely, not doing all the diagonals for half pel is slower than doing them.
+#if 0
+ // now check 1 more diagonal -
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row = (this_mv.row - 8) | 4;
+ diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ case 1:
+ this_mv.col += 4;
+ this_mv.row = (this_mv.row - 8) | 4;
+ diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ case 2:
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row += 4;
+ diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ case 3:
+ this_mv.col += 4;
+ this_mv.row += 4;
+ diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+#else
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row = (this_mv.row - 8) | 4;
+ diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+ this_mv.col += 8;
+ diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+ this_mv.col = (this_mv.col - 8) | 4;
+ this_mv.row = startmv.row + 4;
+ diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+ this_mv.col += 8;
+ diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ }
+
+#endif
+ return bestmse;
+}
+
+
+#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motion vector
+#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
+#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
+#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
+
+int vp8_hex_search
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ MV *ref_mv,
+ MV *best_mv,
+ int search_param,
+ int error_per_bit,
+ int *num00,
+ vp8_variance_fn_t vf,
+ vp8_sad_fn_t sf,
+ int *mvsadcost[2],
+ int *mvcost[2]
+)
+{
+ MV hex[6] = { { -2, 0}, { -1, -2}, { -1, 2}, {2, 0}, {1, 2}, {1, -2} } ;
+ MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
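+    // hex[] traces a hexagonal ring of points around the current search centre (offsets in full pels);
+    // neighbors[] lists the 8 immediately adjacent positions used for the final refinement step.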
+ int i, j;
+ unsigned char *src = (*(b->base_src) + b->src);
+ int src_stride = b->src_stride;
+ int rr = ref_mv->row, rc = ref_mv->col, br = rr, bc = rc, tr, tc;
+ unsigned int besterr, thiserr = 0x7fffffff;
+
+ if (rc < x->mv_col_min) bc = x->mv_col_min;
+
+ if (rc > x->mv_col_max) bc = x->mv_col_max;
+
+ if (rr < x->mv_row_min) br = x->mv_row_min;
+
+ if (rr > x->mv_row_max) br = x->mv_row_max;
+
+ rr >>= 1;
+ rc >>= 1;
+ br >>= 3;
+ bc >>= 3;
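+    // rr/rc keep the reference mv at the resolution expected by the mvsadcost tables (see MVC above),
+    // while br/bc are the search centre reduced to full pels.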
+
+ besterr = ERR(br, bc, thiserr);
+
+    // hex search: jbb changed the limit to 127 to avoid the max 256 problem when stepping by 2.
+ for (j = 0; j < 127; j++)
+ {
+ tr = br;
+ tc = bc;
+
+ for (i = 0; i < 6; i++)
+ {
+ int nr = tr + hex[i].row, nc = tc + hex[i].col;
+
+ if (nc < x->mv_col_min) continue;
+
+ if (nc > x->mv_col_max) continue;
+
+ if (nr < x->mv_row_min) continue;
+
+ if (nr > x->mv_row_max) continue;
+
+ CHECK_BETTER(thiserr, nr, nc);
+ }
+
+ if (tr == br && tc == bc)
+ break;
+ }
+
+    // check the 8 neighbors that are 1 away
+ tr = br;
+ tc = bc;
+
+ for (i = 0; i < 8; i++)
+ {
+ int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
+
+ if (nc < x->mv_col_min) continue;
+
+ if (nc > x->mv_col_max) continue;
+
+ if (nr < x->mv_row_min) continue;
+
+ if (nr > x->mv_row_max) continue;
+
+ CHECK_BETTER(thiserr, nr, nc);
+ }
+
+ best_mv->row = br;
+ best_mv->col = bc;
+
+ return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef ERR
+#undef CHECK_BETTER
+int vp8_diamond_search_sad
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ MV *ref_mv,
+ MV *best_mv,
+ int search_param,
+ int error_per_bit,
+ int *num00,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvsadcost[2],
+ int *mvcost[2]
+)
+{
+ int i, j, step;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ unsigned char *best_address;
+
+ int tot_steps;
+ MV this_mv;
+
+ int bestsad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row = ref_mv->row >> 3;
+ int ref_col = ref_mv->col >> 3;
+ int this_row_offset;
+ int this_col_offset;
+ search_site *ss;
+
+ unsigned char *check_here;
+ int thissad;
+
+ // Work out the start point for the search
+ in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+ best_address = in_what;
+
+ // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+ if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+ (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+ {
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+ }
+
+ // search_param determines the length of the initial step and hence the number of iterations
+ // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
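+    // e.g. with MAX_FIRST_STEP == 128, search_param 3 gives an initial step of 16 pels.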
+ ss = &x->ss[search_param * x->searches_per_step];
+ tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+ i = 1;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ *num00 = 0;
+
+ for (step = 0; step < tot_steps ; step++)
+ {
+ for (j = 0 ; j < x->searches_per_step ; j++)
+ {
+ // Trap illegal vectors
+ this_row_offset = best_mv->row + ss[i].mv.row;
+ this_col_offset = best_mv->col + ss[i].mv.col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+
+ {
+ check_here = ss[i].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.row = this_row_offset << 3;
+ this_mv.col = this_col_offset << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+
+ i++;
+ }
+
+ if (best_site != last_site)
+ {
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ }
+ else if (best_address == in_what)
+ (*num00)++;
+ }
+
+ this_mv.row = best_mv->row << 3;
+ this_mv.col = best_mv->col << 3;
+
+ if (bestsad == INT_MAX)
+ return INT_MAX;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+ + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+}
+
+int vp8_diamond_search_sadx4
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ MV *ref_mv,
+ MV *best_mv,
+ int search_param,
+ int error_per_bit,
+ int *num00,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvsadcost[2],
+ int *mvcost[2]
+)
+{
+ int i, j, step;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ unsigned char *best_address;
+
+ int tot_steps;
+ MV this_mv;
+
+ unsigned int bestsad = UINT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row = ref_mv->row >> 3;
+ int ref_col = ref_mv->col >> 3;
+ int this_row_offset;
+ int this_col_offset;
+ search_site *ss;
+
+ unsigned char *check_here;
+ unsigned int thissad;
+
+ // Work out the start point for the search
+ in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+ best_address = in_what;
+
+ // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+ if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+ (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+ {
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+ }
+
+ // search_param determines the length of the initial step and hence the number of iterations
+ // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+ ss = &x->ss[search_param * x->searches_per_step];
+ tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+ i = 1;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ *num00 = 0;
+
+ for (step = 0; step < tot_steps ; step++)
+ {
+ int check_row_min, check_col_min, check_row_max, check_col_max;
+
+ check_row_min = x->mv_row_min - best_mv->row;
+ check_row_max = x->mv_row_max - best_mv->row;
+ check_col_min = x->mv_col_min - best_mv->col;
+ check_col_max = x->mv_col_max - best_mv->col;
+
+ for (j = 0 ; j < x->searches_per_step ; j += 4)
+ {
+ unsigned char *block_offset[4];
+ unsigned int valid_block[4];
+ int all_in = 1, t;
+
+ for (t = 0; t < 4; t++)
+ {
+ valid_block [t] = (ss[t+i].mv.col > check_col_min);
+ valid_block [t] &= (ss[t+i].mv.col < check_col_max);
+ valid_block [t] &= (ss[t+i].mv.row > check_row_min);
+ valid_block [t] &= (ss[t+i].mv.row < check_row_max);
+
+ all_in &= valid_block[t];
+ block_offset[t] = ss[i+t].offset + best_address;
+ }
+
+ if (all_in)
+ {
+ unsigned int sad_array[4];
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+ for (t = 0; t < 4; t++, i++)
+ {
+ thissad = sad_array[t];
+
+ if (thissad < bestsad)
+ {
+ this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
+ this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ }
+ else
+ {
+ int t;
+
+ for (t = 0; t < 4; i++, t++)
+ {
+ // Trap illegal vectors
+ if (valid_block[t])
+
+ {
+ check_here = block_offset[t];
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_row_offset = best_mv->row + ss[i].mv.row;
+ this_col_offset = best_mv->col + ss[i].mv.col;
+
+ this_mv.row = this_row_offset << 3;
+ this_mv.col = this_col_offset << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (best_site != last_site)
+ {
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ }
+ else if (best_address == in_what)
+ (*num00)++;
+ }
+
+ this_mv.row = best_mv->row << 3;
+ this_mv.col = best_mv->col << 3;
+
+    if (bestsad == UINT_MAX)  // bestsad is unsigned here, unlike in the plain SAD version above
+ return INT_MAX;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+ + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+}
+
+
+#if !(CONFIG_REALTIME_ONLY)
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ int mv_stride = d->pre_stride;
+ unsigned char *bestaddress;
+ MV *best_mv = &d->bmi.mv.as_mv;
+ MV this_mv;
+ int bestsad = INT_MAX;
+ int r, c;
+
+ unsigned char *check_here;
+ int thissad;
+
+ int ref_row = ref_mv->row >> 3;
+ int ref_col = ref_mv->col >> 3;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ // Work out the mid point for the search
+ in_what = *(d->base_pre) + d->pre;
+ bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+ if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+ (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+ {
+ // Baseline value at the centre
+
+ //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+ }
+
+    // Apply further limits to prevent us using vectors that stretch beyond the UMV border
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.row = r << 3;
+ check_here = r * mv_stride + in_what + col_min;
+
+ for (c = col_min; c < col_max; c++)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ this_mv.col = c << 3;
+ //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
+ //thissad += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+
+ check_here++;
+ }
+ }
+
+ this_mv.row = best_mv->row << 3;
+ this_mv.col = best_mv->col << 3;
+
+ if (bestsad < INT_MAX)
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+ else
+ return INT_MAX;
+}
+
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ int mv_stride = d->pre_stride;
+ unsigned char *bestaddress;
+ MV *best_mv = &d->bmi.mv.as_mv;
+ MV this_mv;
+ unsigned int bestsad = UINT_MAX;
+ int r, c;
+
+ unsigned char *check_here;
+ unsigned int thissad;
+
+ int ref_row = ref_mv->row >> 3;
+ int ref_col = ref_mv->col >> 3;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ unsigned int sad_array[3];
+
+ // Work out the mid point for the search
+ in_what = *(d->base_pre) + d->pre;
+ bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+ if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+ (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+ {
+ // Baseline value at the centre
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+ }
+
+    // Apply further limits to prevent us using vectors that stretch beyond the UMV border
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.row = r << 3;
+ check_here = r * mv_stride + in_what + col_min;
+ c = col_min;
+
+ while ((c + 3) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
+
+ for (i = 0; i < 3; i++)
+ {
+ thissad = sad_array[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.col = c << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while (c < col_max)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.col = c << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here ++;
+ c ++;
+ }
+
+ }
+
+ this_mv.row = best_mv->row << 3;
+ this_mv.col = best_mv->col << 3;
+
+ if (bestsad < INT_MAX)
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+ else
+ return INT_MAX;
+}
+#endif
+
+#ifdef ENTROPY_STATS
+void print_mode_context(void)
+{
+ FILE *f = fopen("modecont.c", "w");
+ int i, j;
+
+ fprintf(f, "#include \"entropy.h\"\n");
+ fprintf(f, "const int vp8_mode_contexts[6][4] =\n");
+ fprintf(f, "{\n");
+
+ for (j = 0; j < 6; j++)
+ {
+ fprintf(f, " { // %d \n", j);
+ fprintf(f, " ");
+
+ for (i = 0; i < 4; i++)
+ {
+ int overal_prob;
+ int this_prob;
+ int count; // = mv_ref_ct[j][i][0]+mv_ref_ct[j][i][1];
+
+ // Overall probs
+ count = mv_mode_cts[i][0] + mv_mode_cts[i][1];
+
+ if (count)
+ overal_prob = 256 * mv_mode_cts[i][0] / count;
+ else
+ overal_prob = 128;
+
+ if (overal_prob == 0)
+ overal_prob = 1;
+
+ // context probs
+ count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+
+ if (count)
+ this_prob = 256 * mv_ref_ct[j][i][0] / count;
+ else
+ this_prob = 128;
+
+ if (this_prob == 0)
+ this_prob = 1;
+
+ fprintf(f, "%5d, ", this_prob);
+ //fprintf(f,"%5d, %5d, %8d,", this_prob, overal_prob, (this_prob << 10)/overal_prob);
+ //fprintf(f,"%8d, ", (this_prob << 10)/overal_prob);
+ }
+
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, "};\n");
+ fclose(f);
+}
+
+/* MV ref count ENTROPY_STATS stats code */
+#ifdef ENTROPY_STATS
+void init_mv_ref_counts()
+{
+ vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+ vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+}
+
+void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
+{
+ if (m == ZEROMV)
+ {
+ ++mv_ref_ct [ct[0]] [0] [0];
+ ++mv_mode_cts[0][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[0]] [0] [1];
+ ++mv_mode_cts[0][1];
+
+ if (m == NEARESTMV)
+ {
+ ++mv_ref_ct [ct[1]] [1] [0];
+ ++mv_mode_cts[1][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[1]] [1] [1];
+ ++mv_mode_cts[1][1];
+
+ if (m == NEARMV)
+ {
+ ++mv_ref_ct [ct[2]] [2] [0];
+ ++mv_mode_cts[2][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[2]] [2] [1];
+ ++mv_mode_cts[2][1];
+
+ if (m == NEWMV)
+ {
+ ++mv_ref_ct [ct[3]] [3] [0];
+ ++mv_mode_cts[3][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[3]] [3] [1];
+ ++mv_mode_cts[3][1];
+ }
+ }
+ }
+ }
+}
+
+#endif/* END MV ref count ENTROPY_STATS stats code */
+
+#endif
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
new file mode 100644
index 000000000..921206fec
--- /dev/null
+++ b/vp8/encoder/mcomp.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MCOMP_H
+#define __INC_MCOMP_H
+
+#include "block.h"
+#include "variance.h"
+
+#ifdef ENTROPY_STATS
+extern void init_mv_ref_counts();
+extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
+#endif
+
+
+#define MAX_MVSEARCH_STEPS 8 // The maximum number of steps in a step search given the largest allowed initial step
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS+3)) - 8) // Max full pel mv specified in 1/8 pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) // Maximum size of the first step in full pel units
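+// With MAX_MVSEARCH_STEPS == 8 these work out to
+// MAX_FULL_PEL_VAL = (1 << 11) - 8 = 2040 in 1/8 pel units (255 full pels)
+// and MAX_FIRST_STEP = 1 << 7 = 128 full pels.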
+
+
+extern void print_mode_context(void);
+extern int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight);
+extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+extern void vp8_init3smotion_compensation(MACROBLOCK *x, int stride);
+
+
+extern int vp8_hex_search
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ MV *ref_mv,
+ MV *best_mv,
+ int search_param,
+ int error_per_bit,
+ int *num00,
+ vp8_variance_fn_t vf,
+ vp8_sad_fn_t sf,
+ int *mvsadcost[2],
+ int *mvcost[2]
+
+);
+
+typedef int (fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]);
+extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
+extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
+extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
+extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
+
+#define prototype_full_search_sad(sym)\
+ int (sym)\
+ (\
+ MACROBLOCK *x, \
+ BLOCK *b, \
+ BLOCKD *d, \
+ MV *ref_mv, \
+ int error_per_bit, \
+ int distance, \
+ vp8_variance_fn_ptr_t *fn_ptr, \
+ int *mvcost[2], \
+ int *mvsadcost[2] \
+ )
+
+#define prototype_diamond_search_sad(sym)\
+ int (sym)\
+ (\
+ MACROBLOCK *x, \
+ BLOCK *b, \
+ BLOCKD *d, \
+ MV *ref_mv, \
+ MV *best_mv, \
+ int search_param, \
+ int error_per_bit, \
+ int *num00, \
+ vp8_variance_fn_ptr_t *fn_ptr, \
+ int *mvsadcost[2], \
+ int *mvcost[2] \
+ )
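+
+// For reference, prototype_full_search_sad(vp8_full_search_sad) expands to
+// the declaration:
+// int (vp8_full_search_sad)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv,
+//                           int error_per_bit, int distance,
+//                           vp8_variance_fn_ptr_t *fn_ptr,
+//                           int *mvcost[2], int *mvsadcost[2]);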
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/mcomp_x86.h"
+#endif
+
+typedef prototype_full_search_sad(*vp8_full_search_fn_t);
+extern prototype_full_search_sad(vp8_full_search_sad);
+extern prototype_full_search_sad(vp8_full_search_sadx3);
+
+typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
+extern prototype_diamond_search_sad(vp8_diamond_search_sad);
+extern prototype_diamond_search_sad(vp8_diamond_search_sadx4);
+
+#ifndef vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sad
+#endif
+extern prototype_full_search_sad(vp8_search_full_search);
+
+#ifndef vp8_search_diamond_search
+#define vp8_search_diamond_search vp8_diamond_search_sad
+#endif
+extern prototype_diamond_search_sad(vp8_search_diamond_search);
+
+typedef struct
+{
+ prototype_full_search_sad(*full_search);
+ prototype_diamond_search_sad(*diamond_search);
+} vp8_search_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define SEARCH_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define SEARCH_INVOKE(ctx,fn) vp8_search_##fn
+#endif
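+
+// Illustrative use only (the RTCD struct member name here is assumed, not
+// taken from this header):
+//   SEARCH_INVOKE(&cpi->rtcd.search, full_search)(x, b, d, ref_mv,
+//       error_per_bit, distance, fn_ptr, mvcost, mvsadcost);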
+
+#endif
diff --git a/vp8/encoder/modecosts.c b/vp8/encoder/modecosts.c
new file mode 100644
index 000000000..73170cf52
--- /dev/null
+++ b/vp8/encoder/modecosts.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "entropymode.h"
+
+
+void vp8_init_mode_costs(VP8_COMP *c)
+{
+ VP8_COMMON *x = &c->common;
+ {
+ const vp8_tree_p T = vp8_bmode_tree;
+
+ int i = 0;
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ vp8_cost_tokens((int *)c->mb.bmode_costs[i][j], x->kf_bmode_prob[i][j], T);
+ }
+ while (++j < VP8_BINTRAMODES);
+ }
+ while (++i < VP8_BINTRAMODES);
+
+ vp8_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
+ }
+ vp8_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.sub_mv_ref_prob, vp8_sub_mv_ref_tree);
+
+ vp8_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp8_ymode_tree);
+ vp8_cost_tokens(c->mb.mbmode_cost[0], x->kf_ymode_prob, vp8_kf_ymode_tree);
+
+ vp8_cost_tokens(c->mb.intra_uv_mode_cost[1], x->fc.uv_mode_prob, vp8_uv_mode_tree);
+ vp8_cost_tokens(c->mb.intra_uv_mode_cost[0], x->kf_uv_mode_prob, vp8_uv_mode_tree);
+}
diff --git a/vp8/encoder/modecosts.h b/vp8/encoder/modecosts.h
new file mode 100644
index 000000000..5ade26566
--- /dev/null
+++ b/vp8/encoder/modecosts.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECOSTS_H
+#define __INC_MODECOSTS_H
+
+void vp8_init_mode_costs(VP8_COMP *x);
+
+#endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
new file mode 100644
index 000000000..7662720c3
--- /dev/null
+++ b/vp8/encoder/onyx_if.c
@@ -0,0 +1,5428 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "onyx_int.h"
+#include "systemdependent.h"
+#include "quantize.h"
+#include "alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "extend.h"
+#include "ratectrl.h"
+#include "quant_common.h"
+#include "segmentation_common.h"
+#include "g_common.h"
+#include "vpx_scale/yv12extend.h"
+#include "postproc.h"
+#include "vpx_mem/vpx_mem.h"
+#include "swapyv12buffer.h"
+#include "threading.h"
+#include "vpx_ports/vpx_timer.h"
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#define RTCD(x) &cpi->common.rtcd.x
+#else
+#define IF_RTCD(x) NULL
+#define RTCD(x) NULL
+#endif
+
+extern void vp8cx_init_mv_bits_sadcost();
+extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
+extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
+extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
+
+extern void vp8_init_loop_filter(VP8_COMMON *cm);
+extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val);
+extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val, int sharpness_lvl);
+extern void vp8_dmachine_specific_config(VP8_COMP *cpi);
+extern void vp8_cmachine_specific_config(VP8_COMP *cpi);
+extern void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi);
+extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag);
+extern void print_parms(VP8_CONFIG *ocf, char *filenam);
+extern unsigned int vp8_get_processor_freq();
+extern void print_tree_update_probs();
+extern void vp8cx_create_encoder_threads(VP8_COMP *cpi);
+extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi);
+#if HAVE_ARMV7
+extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+#endif
+
+int vp8_estimate_entropy_savings(VP8_COMP *cpi);
+int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
+int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
+
+
+static void mode_ref_lf_test_function(VP8_COMP *cpi);
+
+extern const int vp8_gf_interval_table[101];
+
+#if CONFIG_PSNR
+#include "math.h"
+
+extern double vp8_calc_ssim
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ int lumamask,
+ double *weight
+);
+
+extern double vp8_calc_ssimg
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ double *ssim_y,
+ double *ssim_u,
+ double *ssim_v
+);
+
+
+#endif
+
+
+#ifdef OUTPUT_YUV_SRC
+FILE *yuv_file;
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#if 0
+extern int skip_true_count;
+extern int skip_false_count;
+#endif
+
+
+#ifdef ENTROPY_STATS
+extern int intra_mode_stats[10][10][10];
+#endif
+
+#ifdef SPEEDSTATS
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int tot_pm = 0;
+unsigned int cnt_pm = 0;
+unsigned int tot_ef = 0;
+unsigned int cnt_ef = 0;
+#endif
+
+#ifdef MODE_STATS
+extern unsigned __int64 Sectionbits[50];
+extern int y_modes[5] ;
+extern int uv_modes[4] ;
+extern int b_modes[10] ;
+
+extern int inter_y_modes[10] ;
+extern int inter_uv_modes[4] ;
+extern unsigned int inter_b_modes[15];
+#endif
+
+extern void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+extern void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+extern void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+extern void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+
+extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
+
+extern const int qrounding_factors[129];
+extern const int qzbin_factors[129];
+extern void vp8cx_init_quantizer(VP8_COMP *cpi);
+extern const int vp8cx_base_skip_false_prob[128];
+
+
+void vp8_initialize()
+{
+ static int init_done = 0;
+
+ if (!init_done)
+ {
+ vp8_scale_machine_specific_config();
+ vp8_initialize_common();
+ //vp8_dmachine_specific_config();
+ vp8_tokenize_initialize();
+
+ vp8cx_init_mv_bits_sadcost();
+ init_done = 1;
+ }
+}
+#ifdef PACKET_TESTING
+extern FILE *vpxlogc;
+#endif
+
+static void setup_features(VP8_COMP *cpi)
+{
+ // Set up default state for MB feature flags
+ cpi->mb.e_mbd.segmentation_enabled = 0;
+ cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+ vpx_memset(cpi->mb.e_mbd.mb_segment_tree_probs, 255, sizeof(cpi->mb.e_mbd.mb_segment_tree_probs));
+ vpx_memset(cpi->mb.e_mbd.segment_feature_data, 0, sizeof(cpi->mb.e_mbd.segment_feature_data));
+
+ cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+ vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+ // jbb trial !
+ mode_ref_lf_test_function(cpi);
+
+}
+
+
+void vp8_dealloc_compressor_data(VP8_COMP *cpi)
+{
+
+ // Delete segmentation map
+ if (cpi->segmentation_map != 0)
+ vpx_free(cpi->segmentation_map);
+
+ cpi->segmentation_map = 0;
+
+ if (cpi->active_map != 0)
+ vpx_free(cpi->active_map);
+
+ cpi->active_map = 0;
+
+ // Delete first pass motion map
+ if (cpi->fp_motion_map != 0)
+ vpx_free(cpi->fp_motion_map);
+
+ cpi->fp_motion_map = 0;
+
+ vp8_de_alloc_frame_buffers(&cpi->common);
+
+ vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
+ vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
+#if VP8_TEMPORAL_ALT_REF
+ vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer);
+#endif
+ {
+ int i;
+
+ for (i = 0; i < MAX_LAG_BUFFERS; i++)
+ vp8_yv12_de_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer);
+
+ cpi->source_buffer_count = 0;
+ }
+
+ vpx_free(cpi->tok);
+ cpi->tok = 0;
+
+}
+
+static void enable_segmentation(VP8_PTR ptr)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+ // Set the appropriate feature bit
+ cpi->mb.e_mbd.segmentation_enabled = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+static void disable_segmentation(VP8_PTR ptr)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+ // Clear the appropriate feature bit
+ cpi->mb.e_mbd.segmentation_enabled = 0;
+}
+
+// Valid values for a segment are 0 to 3
+// Segmentation map is arranged as [Rows][Columns]
+static void set_segmentation_map(VP8_PTR ptr, unsigned char *segmentation_map)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+ // Copy in the new segmentation map
+ vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
+
+ // Signal that the map should be updated.
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+
+// The values given for each segment can be either deltas (from the default value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q) , (0-63 for SEGMENT_ALT_LF)
+// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q) , (+/-63 for SEGMENT_ALT_LF)
+//
+// abs_delta = SEGMENT_DELTADATA (use the values as deltas) or abs_delta = SEGMENT_ABSDATA (use the absolute values given).
+//
+//
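+// For example, with abs_delta == SEGMENT_DELTADATA a MB_LVL_ALT_Q value of +4
+// for a segment is applied as an offset to the frame's base Q index, whereas
+// with SEGMENT_ABSDATA the same value would set that segment's Q index to 4.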
+static void set_segment_data(VP8_PTR ptr, signed char *feature_data, unsigned char abs_delta)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+ cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
+ vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
+}
+
+
+static void segmentation_test_function(VP8_PTR ptr)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+ unsigned char *seg_map;
+ signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+ int i, j;
+
+ // Create a temporary map for segmentation data.
+ CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
+ // MB loop to set local segmentation map
+ /*for ( i = 0; i < cpi->common.mb_rows; i++ )
+ {
+ for ( j = 0; j < cpi->common.mb_cols; j++ )
+ {
+ //seg_map[(i*cpi->common.mb_cols) + j] = (j % 2) + ((i%2)* 2);
+ //if ( j < cpi->common.mb_cols/2 )
+
+ // Segment 1 around the edge else 0
+ if ( (i == 0) || (j == 0) || (i == (cpi->common.mb_rows-1)) || (j == (cpi->common.mb_cols-1)) )
+ seg_map[(i*cpi->common.mb_cols) + j] = 1;
+ //else if ( (i < 2) || (j < 2) || (i > (cpi->common.mb_rows-3)) || (j > (cpi->common.mb_cols-3)) )
+ // seg_map[(i*cpi->common.mb_cols) + j] = 2;
+ //else if ( (i < 5) || (j < 5) || (i > (cpi->common.mb_rows-6)) || (j > (cpi->common.mb_cols-6)) )
+ // seg_map[(i*cpi->common.mb_cols) + j] = 3;
+ else
+ seg_map[(i*cpi->common.mb_cols) + j] = 0;
+ }
+ }*/
+
+ // Set the segmentation Map
+ set_segmentation_map(ptr, seg_map);
+
+ // Activate segmentation.
+ enable_segmentation(ptr);
+
+ // Set up the quant segment data
+ feature_data[MB_LVL_ALT_Q][0] = 0;
+ feature_data[MB_LVL_ALT_Q][1] = 4;
+ feature_data[MB_LVL_ALT_Q][2] = 0;
+ feature_data[MB_LVL_ALT_Q][3] = 0;
+ // Set up the loop segment data
+ feature_data[MB_LVL_ALT_LF][0] = 0;
+ feature_data[MB_LVL_ALT_LF][1] = 0;
+ feature_data[MB_LVL_ALT_LF][2] = 0;
+ feature_data[MB_LVL_ALT_LF][3] = 0;
+
+ // Initialise the feature data structure
+ // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
+ set_segment_data(ptr, &feature_data[0][0], SEGMENT_DELTADATA);
+
+ // Delete segmentation map
+ if (seg_map != 0)
+ vpx_free(seg_map);
+
+ seg_map = 0;
+
+}
+
+// A simple function to cyclically refresh the background at a lower Q
+static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
+{
+ unsigned char *seg_map;
+ signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+ int i;
+ int block_count = cpi->cyclic_refresh_mode_max_mbs_perframe;
+ int mbs_in_frame = cpi->common.mb_rows * cpi->common.mb_cols;
+
+ // Create a temporary map for segmentation data.
+ CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
+ cpi->cyclic_refresh_q = Q;
+
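+ // Search down from the frame Q for the first Q index whose estimated
+ // per-MB bit cost reaches (Q + 128) / 64 times that of Q itself; the
+ // result becomes the boosted Q used for the refresh segment.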
+ for (i = Q; i > 0; i--)
+ {
+ if (vp8_bits_per_mb[cpi->common.frame_type][i] >= ((vp8_bits_per_mb[cpi->common.frame_type][Q]*(Q + 128)) / 64))
+ //if ( vp8_bits_per_mb[cpi->common.frame_type][i] >= ((vp8_bits_per_mb[cpi->common.frame_type][Q]*((2*Q)+96))/64) )
+ {
+ break;
+ }
+ }
+
+ cpi->cyclic_refresh_q = i;
+
+ // Only update for inter frames
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ // Cycle through the macro_block rows
+ // MB loop to set local segmentation map
+ for (i = cpi->cyclic_refresh_mode_index; i < mbs_in_frame; i++)
+ {
+ // If the MB is a candidate for clean up then mark it for possible boost/refresh (segment 1)
+ // The segment id may get reset to 0 later if the MB gets coded as anything other than last frame 0,0,
+ // as only (last frame 0,0) MBs are eligible for refresh: that is to say, MBs likely to be background blocks.
+ if (cpi->cyclic_refresh_map[i] == 0)
+ {
+ seg_map[i] = 1;
+ }
+ else
+ {
+ seg_map[i] = 0;
+
+ // Skip blocks that have been refreshed recently anyway.
+ if (cpi->cyclic_refresh_map[i] < 0)
+ //cpi->cyclic_refresh_map[i] = cpi->cyclic_refresh_map[i] / 16;
+ cpi->cyclic_refresh_map[i]++;
+ }
+
+
+ if (block_count > 0)
+ block_count--;
+ else
+ break;
+
+ }
+
+ // If we have gone through the frame, reset to the start
+ cpi->cyclic_refresh_mode_index = i;
+
+ if (cpi->cyclic_refresh_mode_index >= mbs_in_frame)
+ cpi->cyclic_refresh_mode_index = 0;
+ }
+
+ // Set the segmentation Map
+ set_segmentation_map((VP8_PTR)cpi, seg_map);
+
+ // Activate segmentation.
+ enable_segmentation((VP8_PTR)cpi);
+
+ // Set up the quant segment data
+ feature_data[MB_LVL_ALT_Q][0] = 0;
+ feature_data[MB_LVL_ALT_Q][1] = (cpi->cyclic_refresh_q - Q);
+ feature_data[MB_LVL_ALT_Q][2] = 0;
+ feature_data[MB_LVL_ALT_Q][3] = 0;
+
+ // Set up the loop segment data
+ feature_data[MB_LVL_ALT_LF][0] = 0;
+ feature_data[MB_LVL_ALT_LF][1] = lf_adjustment;
+ feature_data[MB_LVL_ALT_LF][2] = 0;
+ feature_data[MB_LVL_ALT_LF][3] = 0;
+
+ // Initialise the feature data structure
+ // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
+ set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+ // Delete segmentation map
+ if (seg_map != 0)
+ vpx_free(seg_map);
+
+ seg_map = 0;
+
+}
+
+static void mode_ref_lf_test_function(VP8_COMP *cpi)
+{
+ cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+
+ vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+ // Test of ref frame deltas
+ cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
+ cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
+ cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
+ cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
+
+ cpi->mb.e_mbd.mode_lf_deltas[0] = 4; // BPRED
+ cpi->mb.e_mbd.mode_lf_deltas[1] = -2; // Zero
+ cpi->mb.e_mbd.mode_lf_deltas[2] = 2; // New mv
+ cpi->mb.e_mbd.mode_lf_deltas[3] = 4; // Split mv
+}
+
+void vp8_set_speed_features(VP8_COMP *cpi)
+{
+ SPEED_FEATURES *sf = &cpi->sf;
+ int Mode = cpi->compressor_speed;
+ int Speed = cpi->Speed;
+ int i;
+ VP8_COMMON *cm = &cpi->common;
+
+ // Initialise default mode frequency sampling variables
+ for (i = 0; i < MAX_MODES; i ++)
+ {
+ cpi->mode_check_freq[i] = 0;
+ cpi->mode_test_hit_counts[i] = 0;
+ cpi->mode_chosen_counts[i] = 0;
+ }
+
+ cpi->mbs_tested_so_far = 0;
+
+ // best quality
+ sf->RD = 1;
+ sf->search_method = NSTEP;
+ sf->improved_quant = 1;
+ sf->improved_dct = 1;
+ sf->auto_filter = 1;
+ sf->recode_loop = 1;
+ sf->quarter_pixel_search = 1;
+ sf->half_pixel_search = 1;
+ sf->full_freq[0] = 7;
+ sf->full_freq[1] = 7;
+ sf->min_fs_radius = 8;
+ sf->max_fs_radius = 32;
+ sf->iterative_sub_pixel = 1;
+ sf->optimize_coefficients = 1;
+
+ sf->first_step = 0;
+ sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+
+ cpi->do_full[0] = 0;
+ cpi->do_full[1] = 0;
+
+ // default thresholds to 0
+ for (i = 0; i < MAX_MODES; i++)
+ sf->thresh_mult[i] = 0;
+
+ switch (Mode)
+ {
+#if !(CONFIG_REALTIME_ONLY)
+ case 0: // best quality mode
+ sf->thresh_mult[THR_ZEROMV ] = 0;
+ sf->thresh_mult[THR_ZEROG ] = 0;
+ sf->thresh_mult[THR_ZEROA ] = 0;
+ sf->thresh_mult[THR_NEARESTMV] = 0;
+ sf->thresh_mult[THR_NEARESTG ] = 0;
+ sf->thresh_mult[THR_NEARESTA ] = 0;
+ sf->thresh_mult[THR_NEARMV ] = 0;
+ sf->thresh_mult[THR_NEARG ] = 0;
+ sf->thresh_mult[THR_NEARA ] = 0;
+
+ sf->thresh_mult[THR_DC ] = 0;
+
+ sf->thresh_mult[THR_V_PRED ] = 1000;
+ sf->thresh_mult[THR_H_PRED ] = 1000;
+ sf->thresh_mult[THR_B_PRED ] = 2000;
+ sf->thresh_mult[THR_TM ] = 1000;
+
+ sf->thresh_mult[THR_NEWMV ] = 1000;
+ sf->thresh_mult[THR_NEWG ] = 1000;
+ sf->thresh_mult[THR_NEWA ] = 1000;
+
+ sf->thresh_mult[THR_SPLITMV ] = 2500;
+ sf->thresh_mult[THR_SPLITG ] = 5000;
+ sf->thresh_mult[THR_SPLITA ] = 5000;
+
+ sf->full_freq[0] = 7;
+ sf->full_freq[1] = 15;
+
+ sf->first_step = 0;
+ sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+
+ if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
+ {
+ sf->thresh_mult[THR_NEWMV ] = INT_MAX;
+ sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+ sf->thresh_mult[THR_ZEROMV ] = INT_MAX;
+ sf->thresh_mult[THR_NEARMV ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
+ }
+
+ if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
+ {
+ sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+ sf->thresh_mult[THR_ZEROG ] = INT_MAX;
+ sf->thresh_mult[THR_NEARG ] = INT_MAX;
+ sf->thresh_mult[THR_NEWG ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITG ] = INT_MAX;
+ }
+ else if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
+ {
+ sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+ sf->thresh_mult[THR_ZEROA ] = INT_MAX;
+ sf->thresh_mult[THR_NEARA ] = INT_MAX;
+ sf->thresh_mult[THR_NEWA ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITA ] = INT_MAX;
+ }
+
+ break;
+ case 1:
+ case 3:
+ sf->optimize_coefficients = 0;
+ sf->thresh_mult[THR_NEARESTMV] = 0;
+ sf->thresh_mult[THR_ZEROMV ] = 0;
+ sf->thresh_mult[THR_DC ] = 0;
+ sf->thresh_mult[THR_NEARMV ] = 0;
+ sf->thresh_mult[THR_V_PRED ] = 1000;
+ sf->thresh_mult[THR_H_PRED ] = 1000;
+ sf->thresh_mult[THR_B_PRED ] = 2500;
+ sf->thresh_mult[THR_TM ] = 1000;
+
+ sf->thresh_mult[THR_NEARESTG ] = 1000;
+ sf->thresh_mult[THR_NEARESTA ] = 1000;
+
+ sf->thresh_mult[THR_ZEROG ] = 1000;
+ sf->thresh_mult[THR_ZEROA ] = 1000;
+ sf->thresh_mult[THR_NEARG ] = 1000;
+ sf->thresh_mult[THR_NEARA ] = 1000;
+
+ sf->thresh_mult[THR_NEWMV ] = 1500;
+ sf->thresh_mult[THR_NEWG ] = 1500;
+ sf->thresh_mult[THR_NEWA ] = 1500;
+
+ sf->thresh_mult[THR_SPLITMV ] = 5000;
+ sf->thresh_mult[THR_SPLITG ] = 10000;
+ sf->thresh_mult[THR_SPLITA ] = 10000;
+
+ sf->full_freq[0] = 15;
+ sf->full_freq[1] = 31;
+
+ sf->first_step = 0;
+ sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+
+ if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
+ {
+ sf->thresh_mult[THR_NEWMV ] = INT_MAX;
+ sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+ sf->thresh_mult[THR_ZEROMV ] = INT_MAX;
+ sf->thresh_mult[THR_NEARMV ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
+ }
+ else if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
+ {
+ sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+ sf->thresh_mult[THR_ZEROG ] = INT_MAX;
+ sf->thresh_mult[THR_NEARG ] = INT_MAX;
+ sf->thresh_mult[THR_NEWG ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITG ] = INT_MAX;
+ }
+ else if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
+ {
+ sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+ sf->thresh_mult[THR_ZEROA ] = INT_MAX;
+ sf->thresh_mult[THR_NEARA ] = INT_MAX;
+ sf->thresh_mult[THR_NEWA ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITA ] = INT_MAX;
+ }
+
+ if (Speed > 0)
+ {
+ cpi->mode_check_freq[THR_SPLITG] = 4;
+ cpi->mode_check_freq[THR_SPLITA] = 4;
+ cpi->mode_check_freq[THR_SPLITMV] = 2;
+
+ sf->thresh_mult[THR_TM ] = 1500;
+ sf->thresh_mult[THR_V_PRED ] = 1500;
+ sf->thresh_mult[THR_H_PRED ] = 1500;
+ sf->thresh_mult[THR_B_PRED ] = 5000;
+
+ if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ {
+ sf->thresh_mult[THR_NEWMV ] = 2000;
+ sf->thresh_mult[THR_SPLITMV ] = 10000;
+ }
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTG ] = 1500;
+ sf->thresh_mult[THR_ZEROG ] = 1500;
+ sf->thresh_mult[THR_NEARG ] = 1500;
+ sf->thresh_mult[THR_NEWG ] = 2000;
+ sf->thresh_mult[THR_SPLITG ] = 20000;
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTA ] = 1500;
+ sf->thresh_mult[THR_ZEROA ] = 1500;
+ sf->thresh_mult[THR_NEARA ] = 1500;
+ sf->thresh_mult[THR_NEWA ] = 2000;
+ sf->thresh_mult[THR_SPLITA ] = 20000;
+ }
+
+ sf->improved_quant = 0;
+ sf->improved_dct = 0;
+
+ sf->first_step = 1;
+ sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+ }
+
+ if (Speed > 1)
+ {
+ cpi->mode_check_freq[THR_SPLITG] = 15;
+ cpi->mode_check_freq[THR_SPLITA] = 15;
+ cpi->mode_check_freq[THR_SPLITMV] = 7;
+
+ sf->thresh_mult[THR_TM ] = 2000;
+ sf->thresh_mult[THR_V_PRED ] = 2000;
+ sf->thresh_mult[THR_H_PRED ] = 2000;
+ sf->thresh_mult[THR_B_PRED ] = 7500;
+
+ if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ {
+ sf->thresh_mult[THR_NEWMV ] = 2000;
+ sf->thresh_mult[THR_SPLITMV ] = 25000;
+ }
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTG ] = 2000;
+ sf->thresh_mult[THR_ZEROG ] = 2000;
+ sf->thresh_mult[THR_NEARG ] = 2000;
+ sf->thresh_mult[THR_NEWG ] = 2500;
+ sf->thresh_mult[THR_SPLITG ] = 50000;
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTA ] = 2000;
+ sf->thresh_mult[THR_ZEROA ] = 2000;
+ sf->thresh_mult[THR_NEARA ] = 2000;
+ sf->thresh_mult[THR_NEWA ] = 2500;
+ sf->thresh_mult[THR_SPLITA ] = 50000;
+ }
+
+ // Only do recode loop on key frames and golden frames
+ sf->recode_loop = 2;
+
+ sf->full_freq[0] = 31;
+ sf->full_freq[1] = 63;
+
+ }
+
+ if (Speed > 2)
+ {
+ sf->auto_filter = 0; // Faster selection of loop filter
+ cpi->mode_check_freq[THR_V_PRED] = 2;
+ cpi->mode_check_freq[THR_H_PRED] = 2;
+ cpi->mode_check_freq[THR_B_PRED] = 2;
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ cpi->mode_check_freq[THR_NEARG] = 2;
+ cpi->mode_check_freq[THR_NEWG] = 4;
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ cpi->mode_check_freq[THR_NEARA] = 2;
+ cpi->mode_check_freq[THR_NEWA] = 4;
+ }
+
+ sf->thresh_mult[THR_SPLITA ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITG ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
+
+ sf->full_freq[0] = 63;
+ sf->full_freq[1] = 127;
+ }
+
+ if (Speed > 3)
+ {
+ cpi->mode_check_freq[THR_V_PRED] = 0;
+ cpi->mode_check_freq[THR_H_PRED] = 0;
+ cpi->mode_check_freq[THR_B_PRED] = 0;
+ cpi->mode_check_freq[THR_NEARG] = 0;
+ cpi->mode_check_freq[THR_NEWG] = 0;
+ cpi->mode_check_freq[THR_NEARA] = 0;
+ cpi->mode_check_freq[THR_NEWA] = 0;
+
+ sf->auto_filter = 1;
+ sf->recode_loop = 0; // recode loop off
+ sf->RD = 0; // Turn rd off
+ sf->full_freq[0] = INT_MAX;
+ sf->full_freq[1] = INT_MAX;
+ }
+
+ if (Speed > 4)
+ {
+ sf->auto_filter = 0; // Faster selection of loop filter
+
+ cpi->mode_check_freq[THR_V_PRED] = 2;
+ cpi->mode_check_freq[THR_H_PRED] = 2;
+ cpi->mode_check_freq[THR_B_PRED] = 2;
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ cpi->mode_check_freq[THR_NEARG] = 2;
+ cpi->mode_check_freq[THR_NEWG] = 4;
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ cpi->mode_check_freq[THR_NEARA] = 2;
+ cpi->mode_check_freq[THR_NEWA] = 4;
+ }
+
+ if ((cpi->ref_frame_flags & (VP8_LAST_FLAG | VP8_GOLD_FLAG)) == (VP8_LAST_FLAG | VP8_GOLD_FLAG))
+ {
+ sf->thresh_mult[THR_NEARESTG ] = 2000;
+ sf->thresh_mult[THR_ZEROG ] = 2000;
+ sf->thresh_mult[THR_NEARG ] = 2000;
+ sf->thresh_mult[THR_NEWG ] = 4000;
+ }
+
+ if ((cpi->ref_frame_flags & (VP8_LAST_FLAG | VP8_ALT_FLAG)) == (VP8_LAST_FLAG | VP8_ALT_FLAG))
+ {
+ sf->thresh_mult[THR_NEARESTA ] = 2000;
+ sf->thresh_mult[THR_ZEROA ] = 2000;
+ sf->thresh_mult[THR_NEARA ] = 2000;
+ sf->thresh_mult[THR_NEWA ] = 4000;
+ }
+ }
+
+ break;
+#endif
+ case 2:
+ sf->optimize_coefficients = 0;
+ sf->recode_loop = 0;
+ sf->auto_filter = 1;
+ sf->iterative_sub_pixel = 1;
+ sf->thresh_mult[THR_NEARESTMV] = 0;
+ sf->thresh_mult[THR_ZEROMV ] = 0;
+ sf->thresh_mult[THR_DC ] = 0;
+ sf->thresh_mult[THR_TM ] = 0;
+ sf->thresh_mult[THR_NEARMV ] = 0;
+ sf->thresh_mult[THR_V_PRED ] = 1000;
+ sf->thresh_mult[THR_H_PRED ] = 1000;
+ sf->thresh_mult[THR_B_PRED ] = 2500;
+ sf->thresh_mult[THR_NEARESTG ] = 1000;
+ sf->thresh_mult[THR_ZEROG ] = 1000;
+ sf->thresh_mult[THR_NEARG ] = 1000;
+ sf->thresh_mult[THR_NEARESTA ] = 1000;
+ sf->thresh_mult[THR_ZEROA ] = 1000;
+ sf->thresh_mult[THR_NEARA ] = 1000;
+ sf->thresh_mult[THR_NEWMV ] = 2000;
+ sf->thresh_mult[THR_NEWG ] = 2000;
+ sf->thresh_mult[THR_NEWA ] = 2000;
+ sf->thresh_mult[THR_SPLITMV ] = 5000;
+ sf->thresh_mult[THR_SPLITG ] = 10000;
+ sf->thresh_mult[THR_SPLITA ] = 10000;
+ sf->full_freq[0] = 15;
+ sf->full_freq[1] = 31;
+ sf->search_method = NSTEP;
+
+ if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
+ {
+ sf->thresh_mult[THR_NEWMV ] = INT_MAX;
+ sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+ sf->thresh_mult[THR_ZEROMV ] = INT_MAX;
+ sf->thresh_mult[THR_NEARMV ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
+ }
+
+ if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
+ {
+ sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+ sf->thresh_mult[THR_ZEROG ] = INT_MAX;
+ sf->thresh_mult[THR_NEARG ] = INT_MAX;
+ sf->thresh_mult[THR_NEWG ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITG ] = INT_MAX;
+ }
+
+ if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
+ {
+ sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+ sf->thresh_mult[THR_ZEROA ] = INT_MAX;
+ sf->thresh_mult[THR_NEARA ] = INT_MAX;
+ sf->thresh_mult[THR_NEWA ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITA ] = INT_MAX;
+ }
+
+ if (Speed > 0)
+ {
+ cpi->mode_check_freq[THR_SPLITG] = 4;
+ cpi->mode_check_freq[THR_SPLITA] = 4;
+ cpi->mode_check_freq[THR_SPLITMV] = 2;
+
+ sf->thresh_mult[THR_DC ] = 0;
+ sf->thresh_mult[THR_TM ] = 1000;
+ sf->thresh_mult[THR_V_PRED ] = 2000;
+ sf->thresh_mult[THR_H_PRED ] = 2000;
+ sf->thresh_mult[THR_B_PRED ] = 5000;
+
+ if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTMV] = 0;
+ sf->thresh_mult[THR_ZEROMV ] = 0;
+ sf->thresh_mult[THR_NEARMV ] = 0;
+ sf->thresh_mult[THR_NEWMV ] = 2000;
+ sf->thresh_mult[THR_SPLITMV ] = 10000;
+ }
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTG ] = 1000;
+ sf->thresh_mult[THR_ZEROG ] = 1000;
+ sf->thresh_mult[THR_NEARG ] = 1000;
+ sf->thresh_mult[THR_NEWG ] = 2000;
+ sf->thresh_mult[THR_SPLITG ] = 20000;
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTA ] = 1000;
+ sf->thresh_mult[THR_ZEROA ] = 1000;
+ sf->thresh_mult[THR_NEARA ] = 1000;
+ sf->thresh_mult[THR_NEWA ] = 2000;
+ sf->thresh_mult[THR_SPLITA ] = 20000;
+ }
+
+ sf->improved_quant = 0;
+ sf->improved_dct = 0;
+ }
+
+ if (Speed > 1)
+ {
+ cpi->mode_check_freq[THR_SPLITMV] = 7;
+ cpi->mode_check_freq[THR_SPLITG] = 15;
+ cpi->mode_check_freq[THR_SPLITA] = 15;
+
+ sf->thresh_mult[THR_TM ] = 2000;
+ sf->thresh_mult[THR_V_PRED ] = 2000;
+ sf->thresh_mult[THR_H_PRED ] = 2000;
+ sf->thresh_mult[THR_B_PRED ] = 5000;
+
+ if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ {
+ sf->thresh_mult[THR_NEWMV ] = 2000;
+ sf->thresh_mult[THR_SPLITMV ] = 25000;
+ }
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTG ] = 2000;
+ sf->thresh_mult[THR_ZEROG ] = 2000;
+ sf->thresh_mult[THR_NEARG ] = 2000;
+ sf->thresh_mult[THR_NEWG ] = 2500;
+ sf->thresh_mult[THR_SPLITG ] = 50000;
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTA ] = 2000;
+ sf->thresh_mult[THR_ZEROA ] = 2000;
+ sf->thresh_mult[THR_NEARA ] = 2000;
+ sf->thresh_mult[THR_NEWA ] = 2500;
+ sf->thresh_mult[THR_SPLITA ] = 50000;
+ }
+
+ sf->full_freq[0] = 31;
+ sf->full_freq[1] = 63;
+ }
+
+ if (Speed > 2)
+ {
+ sf->auto_filter = 0; // Faster selection of loop filter
+
+ cpi->mode_check_freq[THR_V_PRED] = 2;
+ cpi->mode_check_freq[THR_H_PRED] = 2;
+ cpi->mode_check_freq[THR_B_PRED] = 2;
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ cpi->mode_check_freq[THR_NEARG] = 2;
+ cpi->mode_check_freq[THR_NEWG] = 4;
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ cpi->mode_check_freq[THR_NEARA] = 2;
+ cpi->mode_check_freq[THR_NEWA] = 4;
+ }
+
+ sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITG ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITA ] = INT_MAX;
+
+ sf->full_freq[0] = 63;
+ sf->full_freq[1] = 127;
+ }
+
+ if (Speed > 3)
+ {
+ sf->RD = 0;
+ sf->full_freq[0] = INT_MAX;
+ sf->full_freq[1] = INT_MAX;
+
+ sf->auto_filter = 1;
+ }
+
+ if (Speed > 4)
+ {
+ sf->auto_filter = 0; // Faster selection of loop filter
+
+#if CONFIG_REALTIME_ONLY
+ sf->search_method = HEX;
+#else
+ sf->search_method = DIAMOND;
+#endif
+
+ cpi->mode_check_freq[THR_V_PRED] = 4;
+ cpi->mode_check_freq[THR_H_PRED] = 4;
+ cpi->mode_check_freq[THR_B_PRED] = 4;
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ cpi->mode_check_freq[THR_NEARG] = 2;
+ cpi->mode_check_freq[THR_NEWG] = 4;
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ cpi->mode_check_freq[THR_NEARA] = 2;
+ cpi->mode_check_freq[THR_NEWA] = 4;
+ }
+
+ sf->thresh_mult[THR_TM ] = 2000;
+ sf->thresh_mult[THR_B_PRED ] = 5000;
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTG ] = 2000;
+ sf->thresh_mult[THR_ZEROG ] = 2000;
+ sf->thresh_mult[THR_NEARG ] = 2000;
+ sf->thresh_mult[THR_NEWG ] = 4000;
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ sf->thresh_mult[THR_NEARESTA ] = 2000;
+ sf->thresh_mult[THR_ZEROA ] = 2000;
+ sf->thresh_mult[THR_NEARA ] = 2000;
+ sf->thresh_mult[THR_NEWA ] = 4000;
+ }
+ }
+
+ if (Speed > 5)
+ {
+ // Disable split MB intra prediction mode
+ sf->thresh_mult[THR_B_PRED] = INT_MAX;
+ }
+
+ if (Speed > 6)
+ {
+ unsigned int i, sum = 0;
+ unsigned int total_mbs = cm->MBs;
+ int thresh;
+ int total_skip;
+
+ int min = 2000;
+ sf->iterative_sub_pixel = 0;
+
+ if (cpi->oxcf.encode_breakout > 2000)
+ min = cpi->oxcf.encode_breakout;
+
+ min >>= 7;
+
+ for (i = 0; i < min; i++)
+ {
+ sum += cpi->error_bins[i];
+ }
+
+ total_skip = sum;
+ sum = 0;
+
+ // i starts from 2 to make sure thresh started from 2048
+ for (; i < 1024; i++)
+ {
+ sum += cpi->error_bins[i];
+
+ if (10 * sum >= (unsigned int)(cpi->Speed - 6)*(total_mbs - total_skip))
+ break;
+ }
+
+ i--;
+ thresh = (i << 7);
+
+ if (thresh < 2000)
+ thresh = 2000;
+
+ if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ {
+ sf->thresh_mult[THR_NEWMV] = thresh;
+ sf->thresh_mult[THR_NEARESTMV ] = thresh >> 1;
+ sf->thresh_mult[THR_NEARMV ] = thresh >> 1;
+ }
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ sf->thresh_mult[THR_NEWG] = thresh << 1;
+ sf->thresh_mult[THR_NEARESTG ] = thresh;
+ sf->thresh_mult[THR_NEARG ] = thresh;
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ sf->thresh_mult[THR_NEWA] = thresh << 1;
+ sf->thresh_mult[THR_NEARESTA ] = thresh;
+ sf->thresh_mult[THR_NEARA ] = thresh;
+ }
+
+ // Disable other intra prediction modes
+ sf->thresh_mult[THR_TM] = INT_MAX;
+ sf->thresh_mult[THR_V_PRED] = INT_MAX;
+ sf->thresh_mult[THR_H_PRED] = INT_MAX;
+
+ }
+
+ if (Speed > 8)
+ {
+ sf->quarter_pixel_search = 0;
+ }
+
+ if (Speed > 9)
+ {
+ int Tmp = cpi->Speed - 8;
+
+ if (Tmp > 4)
+ Tmp = 4;
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ cpi->mode_check_freq[THR_ZEROG] = 1 << (Tmp - 1);
+ cpi->mode_check_freq[THR_NEARESTG] = 1 << (Tmp - 1);
+ cpi->mode_check_freq[THR_NEARG] = 1 << Tmp;
+ cpi->mode_check_freq[THR_NEWG] = 1 << (Tmp + 1);
+ }
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+ {
+ cpi->mode_check_freq[THR_ZEROA] = 1 << (Tmp - 1);
+ cpi->mode_check_freq[THR_NEARESTA] = 1 << (Tmp - 1);
+ cpi->mode_check_freq[THR_NEARA] = 1 << Tmp;
+ cpi->mode_check_freq[THR_NEWA] = 1 << (Tmp + 1);
+ }
+
+ cpi->mode_check_freq[THR_NEWMV] = 1 << (Tmp - 1);
+ }
+
+ cm->filter_type = NORMAL_LOOPFILTER;
+
+ if (Speed >= 14)
+ cm->filter_type = SIMPLE_LOOPFILTER;
+
+ if (Speed >= 15)
+ {
+ sf->half_pixel_search = 0; // This has a big hit on quality. Last resort
+ }
+
+ vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins));
+
+ }
+
+ if (cpi->sf.search_method == NSTEP)
+ {
+ vp8_init3smotion_compensation(&cpi->mb, cm->last_frame.y_stride);
+ }
+ else if (cpi->sf.search_method == DIAMOND)
+ {
+ vp8_init_dsmotion_compensation(&cpi->mb, cm->last_frame.y_stride);
+ }
+
+ if (cpi->sf.improved_dct)
+ {
+ cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
+ cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
+ cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
+ cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
+ }
+ else
+ {
+ cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
+ cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
+ cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
+ cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
+ }
+
+ cpi->mb.vp8_short_fdct4x4_ptr = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
+ cpi->mb.short_walsh4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, walsh_short4x4);
+
+ if (cpi->sf.improved_quant)
+ {
+ cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+ cpi->mb.quantize_brd = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+ }
+ else
+ {
+ cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
+ cpi->mb.quantize_brd = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
+ }
+
+#if CONFIG_RUNTIME_CPU_DETECT
+ cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
+#endif
+
+ if (cpi->sf.iterative_sub_pixel == 1)
+ {
+ cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step_iteratively;
+ }
+ else if (cpi->sf.quarter_pixel_search)
+ {
+ cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step;
+ }
+ else if (cpi->sf.half_pixel_search)
+ {
+ cpi->find_fractional_mv_step = vp8_find_best_half_pixel_step;
+ }
+ else
+ {
+ cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step;
+ }
+
+ if (cpi->sf.optimize_coefficients == 1)
+ cpi->mb.optimize = 1;
+ else
+ cpi->mb.optimize = 0;
+
+ if (cpi->common.full_pixel)
+ cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step;
+
+#ifdef SPEEDSTATS
+ frames_at_speed[cpi->Speed]++;
+#endif
+}
+static void alloc_raw_frame_buffers(VP8_COMP *cpi)
+{
+ int i, buffers;
+
+ buffers = cpi->oxcf.lag_in_frames;
+
+ if (buffers > MAX_LAG_BUFFERS)
+ buffers = MAX_LAG_BUFFERS;
+
+ if (buffers < 1)
+ buffers = 1;
+
+ for (i = 0; i < buffers; i++)
+ if (vp8_yv12_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer,
+ cpi->oxcf.Width, cpi->oxcf.Height,
+ 16))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate lag buffer");
+
+#if VP8_TEMPORAL_ALT_REF
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer,
+ cpi->oxcf.Width, cpi->oxcf.Height, 16))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate altref buffer");
+
+#endif
+
+ cpi->source_buffer_count = 0;
+}
+void vp8_alloc_compressor_data(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = & cpi->common;
+
+ int width = cm->Width;
+ int height = cm->Height;
+
+ if (vp8_alloc_frame_buffers(cm, width, height))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffers");
+
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+ if ((height & 0xf) != 0)
+ height += 16 - (height & 0xf);
+
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate last frame buffer");
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source, width, height, 16))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate scaled source buffer");
+
+
+ if (cpi->tok != 0)
+ vpx_free(cpi->tok);
+
+ {
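+ // Sized for roughly one token per coefficient: 24 4x4 blocks
+ // (16 Y, 4 U, 4 V) of 16 coefficients each per macroblock.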
+ unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
+
+ CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+ }
+
+ // Data used for real time vc mode to see if gf needs refreshing
+ cpi->inter_zz_count = 0;
+ cpi->gf_bad_count = 0;
+ cpi->gf_update_recommended = 0;
+}
+
+
+// Quant MOD
+static const int q_trans[] =
+{
+ 0, 1, 2, 3, 4, 5, 7, 8,
+ 9, 10, 12, 13, 15, 17, 18, 19,
+ 20, 21, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31, 33, 35, 37, 39, 41,
+ 43, 45, 47, 49, 51, 53, 55, 57,
+ 59, 61, 64, 67, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 103,
+ 106, 109, 112, 115, 118, 121, 124, 127,
+};
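+
+// q_trans maps the 0..63 user-level quantizer onto the codec's 0..127 Q
+// index; vp8_reverse_trans below inverts the mapping. For example,
+// q_trans[10] == 12 and vp8_reverse_trans(12) == 10.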
+
+int vp8_reverse_trans(int x)
+{
+ int i;
+
+ for (i = 0; i < 64; i++)
+ if (q_trans[i] >= x)
+ return i;
+
+ return 63;
+}
+void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
+{
+ cpi->oxcf.frame_rate = framerate;
+ cpi->output_frame_rate = cpi->oxcf.frame_rate;
+ cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
+ cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
+ cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+ cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
+
+ cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
+ cpi->max_gf_interval = (int)(cpi->output_frame_rate / 2) + 2;
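+ // e.g. at 30 fps the default maximum GF interval is 30 / 2 + 2 = 17 frames,
+ // subject to the clamps below.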
+
+ //cpi->max_gf_interval = (int)(cpi->output_frame_rate * 2 / 3) + 1;
+ //cpi->max_gf_interval = 24;
+
+ if (cpi->max_gf_interval < 12)
+ cpi->max_gf_interval = 12;
+
+
+ // Special conditions when alt ref frame is enabled
+ if (cpi->oxcf.play_alternate)
+ {
+ if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+ cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+ }
+}
+
+void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+ VP8_COMMON *cm = &cpi->common;
+
+ if (!cpi)
+ return;
+
+ cpi->auto_gold = 1;
+ cpi->auto_adjust_gold_quantizer = 1;
+ cpi->goldquantizer = 1;
+ cpi->goldfreq = 7;
+ cpi->auto_adjust_key_quantizer = 1;
+ cpi->keyquantizer = 1;
+
+ cm->version = oxcf->Version;
+ vp8_setup_version(cm);
+
+ if (oxcf == 0)
+ {
+ cpi->pass = 0;
+
+ cpi->auto_worst_q = 0;
+ cpi->oxcf.best_allowed_q = MINQ;
+ cpi->oxcf.worst_allowed_q = MAXQ;
+
+ cpi->oxcf.end_usage = USAGE_STREAM_FROM_SERVER;
+ cpi->oxcf.starting_buffer_level = 4;
+ cpi->oxcf.optimal_buffer_level = 5;
+ cpi->oxcf.maximum_buffer_size = 6;
+ cpi->oxcf.under_shoot_pct = 90;
+ cpi->oxcf.allow_df = 0;
+ cpi->oxcf.drop_frames_water_mark = 20;
+
+ cpi->oxcf.allow_spatial_resampling = 0;
+ cpi->oxcf.resample_down_water_mark = 40;
+ cpi->oxcf.resample_up_water_mark = 60;
+
+ cpi->oxcf.fixed_q = cpi->interquantizer;
+
+ cpi->filter_type = NORMAL_LOOPFILTER;
+
+ if (cm->simpler_lpf)
+ cpi->filter_type = SIMPLE_LOOPFILTER;
+
+ cpi->compressor_speed = 1;
+ cpi->horiz_scale = 0;
+ cpi->vert_scale = 0;
+ cpi->oxcf.two_pass_vbrbias = 50;
+ cpi->oxcf.two_pass_vbrmax_section = 400;
+ cpi->oxcf.two_pass_vbrmin_section = 0;
+
+ cpi->oxcf.Sharpness = 0;
+ cpi->oxcf.noise_sensitivity = 0;
+ }
+ else
+ cpi->oxcf = *oxcf;
+
+
+ switch (cpi->oxcf.Mode)
+ {
+
+ case MODE_REALTIME:
+ cpi->pass = 0;
+ cpi->compressor_speed = 2;
+
+ if (cpi->oxcf.cpu_used < -16)
+ {
+ cpi->oxcf.cpu_used = -16;
+ }
+
+ if (cpi->oxcf.cpu_used > 16)
+ cpi->oxcf.cpu_used = 16;
+
+ break;
+
+#if !(CONFIG_REALTIME_ONLY)
+ case MODE_GOODQUALITY:
+ cpi->pass = 0;
+ cpi->compressor_speed = 1;
+
+ if (cpi->oxcf.cpu_used < -5)
+ {
+ cpi->oxcf.cpu_used = -5;
+ }
+
+ if (cpi->oxcf.cpu_used > 5)
+ cpi->oxcf.cpu_used = 5;
+
+ break;
+
+ case MODE_BESTQUALITY:
+ cpi->pass = 0;
+ cpi->compressor_speed = 0;
+ break;
+
+ case MODE_FIRSTPASS:
+ cpi->pass = 1;
+ cpi->compressor_speed = 1;
+ break;
+ case MODE_SECONDPASS:
+ cpi->pass = 2;
+ cpi->compressor_speed = 1;
+
+ if (cpi->oxcf.cpu_used < -5)
+ {
+ cpi->oxcf.cpu_used = -5;
+ }
+
+ if (cpi->oxcf.cpu_used > 5)
+ cpi->oxcf.cpu_used = 5;
+
+ break;
+ case MODE_SECONDPASS_BEST:
+ cpi->pass = 2;
+ cpi->compressor_speed = 0;
+ break;
+#endif
+ }
+
+ if (cpi->pass == 0)
+ cpi->auto_worst_q = 1;
+
+ cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
+ cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
+
+ if (oxcf->fixed_q >= 0)
+ {
+ if (oxcf->worst_allowed_q < 0)
+ cpi->oxcf.fixed_q = q_trans[0];
+ else
+ cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
+
+ if (oxcf->alt_q < 0)
+ cpi->oxcf.alt_q = q_trans[0];
+ else
+ cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
+
+ if (oxcf->key_q < 0)
+ cpi->oxcf.key_q = q_trans[0];
+ else
+ cpi->oxcf.key_q = q_trans[oxcf->key_q];
+
+ if (oxcf->gold_q < 0)
+ cpi->oxcf.gold_q = q_trans[0];
+ else
+ cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
+
+ }
+
+ cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+ cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+
+ //cpi->use_golden_frame_only = 0;
+ //cpi->use_last_frame_only = 0;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->refresh_entropy_probs = 1;
+
+ if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
+ cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+
+ setup_features(cpi);
+
+ {
+ int i;
+
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+ }
+
+ // At the moment the first order values may not be > MAXQ
+ if (cpi->oxcf.fixed_q > MAXQ)
+ cpi->oxcf.fixed_q = MAXQ;
+
+ // local file playback mode == really big buffer
+ if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
+ {
+ cpi->oxcf.starting_buffer_level = 60;
+ cpi->oxcf.optimal_buffer_level = 60;
+ cpi->oxcf.maximum_buffer_size = 240;
+
+ }
+
+
+ // Convert target bandwidth from Kbit/s to Bit/s
+ cpi->oxcf.target_bandwidth *= 1000;
+ cpi->oxcf.starting_buffer_level *= cpi->oxcf.target_bandwidth;
+
+ if (cpi->oxcf.optimal_buffer_level == 0)
+ cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+ else
+ cpi->oxcf.optimal_buffer_level *= cpi->oxcf.target_bandwidth;
+
+ if (cpi->oxcf.maximum_buffer_size == 0)
+ cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+ else
+ cpi->oxcf.maximum_buffer_size *= cpi->oxcf.target_bandwidth;
+
+ cpi->buffer_level = cpi->oxcf.starting_buffer_level;
+ cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+
+ vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+ cpi->worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
+ cpi->best_quality = cpi->oxcf.best_allowed_q;
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+
+
+ cpi->total_actual_bits = 0;
+ cpi->total_target_vs_actual = 0;
+
+ // Only allow dropped frames in buffered mode
+ cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
+
+ cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
+
+ if (!cm->use_bilinear_mc_filter)
+ cm->mcomp_filter_type = SIXTAP;
+ else
+ cm->mcomp_filter_type = BILINEAR;
+
+ cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+ cm->Width = cpi->oxcf.Width ;
+ cm->Height = cpi->oxcf.Height ;
+
+ cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
+
+ cm->horiz_scale = cpi->horiz_scale;
+ cm->vert_scale = cpi->vert_scale ;
+
+ // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
+ if (cpi->oxcf.Sharpness > 7)
+ cpi->oxcf.Sharpness = 7;
+
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
+ {
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+ // always go to the next whole number
+ cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+ cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+ }
+
+ if (((cm->Width + 15) & 0xfffffff0) != cm->last_frame.y_width ||
+ ((cm->Height + 15) & 0xfffffff0) != cm->last_frame.y_height ||
+ cm->last_frame.y_width == 0)
+ {
+ alloc_raw_frame_buffers(cpi);
+ vp8_alloc_compressor_data(cpi);
+ }
+
+ // Clamp KF frame size to quarter of data rate
+ if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
+ cpi->intra_frame_target = cpi->target_bandwidth >> 2;
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ cpi->last_q[0] = cpi->oxcf.fixed_q;
+ cpi->last_q[1] = cpi->oxcf.fixed_q;
+ }
+
+ cpi->Speed = cpi->oxcf.cpu_used;
+
+ // force allow_lag to 0 if lag_in_frames is 0;
+ if (cpi->oxcf.lag_in_frames == 0)
+ {
+ cpi->oxcf.allow_lag = 0;
+ }
+ // Limit on lag buffers as these are not currently dynamically allocated
+ else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+ cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+
+ // force play_alternate to 0 if allow_lag is 0, lag_in_frames is too small, or the mode is real time or one-pass compression.
+ if (cpi->oxcf.allow_lag == 0 || cpi->oxcf.lag_in_frames <= 5 || (cpi->oxcf.Mode < MODE_SECONDPASS))
+ {
+ cpi->oxcf.play_alternate = 0;
+ cpi->ref_frame_flags = cpi->ref_frame_flags & ~VP8_ALT_FLAG;
+ }
+
+ // YX Temp
+ cpi->last_alt_ref_sei = -1;
+ cpi->is_src_frame_alt_ref = 0;
+
+#if 0
+ // Experimental RD Code
+ cpi->frame_distortion = 0;
+ cpi->last_frame_distortion = 0;
+#endif
+
+#if VP8_TEMPORAL_ALT_REF
+ {
+ int i;
+
+ cpi->fixed_divide[0] = 0;
+
+ for (i = 1; i < 255; i++)
+ cpi->fixed_divide[i] = 0x10000 / i;
+ }
+#endif
+}
+
+/*
+ * This function needs more clean up, i.e. be tuned more towards
+ * change_config rather than init_config !!!!!!!!!!!!!!!!
+ * YX - 5/28/2009
+ *
+ */
+
+void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+ VP8_COMMON *cm = &cpi->common;
+
+ if (!cpi)
+ return;
+
+ if (!oxcf)
+ return;
+
+ if (cm->version != oxcf->Version)
+ {
+ cm->version = oxcf->Version;
+ vp8_setup_version(cm);
+ }
+
+ cpi->oxcf = *oxcf;
+
+ switch (cpi->oxcf.Mode)
+ {
+
+ case MODE_REALTIME:
+ cpi->pass = 0;
+ cpi->compressor_speed = 2;
+
+ if (cpi->oxcf.cpu_used < -16)
+ {
+ cpi->oxcf.cpu_used = -16;
+ }
+
+ if (cpi->oxcf.cpu_used > 16)
+ cpi->oxcf.cpu_used = 16;
+
+ break;
+
+#if !(CONFIG_REALTIME_ONLY)
+ case MODE_GOODQUALITY:
+ cpi->pass = 0;
+ cpi->compressor_speed = 1;
+
+ if (cpi->oxcf.cpu_used < -5)
+ {
+ cpi->oxcf.cpu_used = -5;
+ }
+
+ if (cpi->oxcf.cpu_used > 5)
+ cpi->oxcf.cpu_used = 5;
+
+ break;
+
+ case MODE_BESTQUALITY:
+ cpi->pass = 0;
+ cpi->compressor_speed = 0;
+ break;
+
+ case MODE_FIRSTPASS:
+ cpi->pass = 1;
+ cpi->compressor_speed = 1;
+ break;
+ case MODE_SECONDPASS:
+ cpi->pass = 2;
+ cpi->compressor_speed = 1;
+
+ if (cpi->oxcf.cpu_used < -5)
+ {
+ cpi->oxcf.cpu_used = -5;
+ }
+
+ if (cpi->oxcf.cpu_used > 5)
+ cpi->oxcf.cpu_used = 5;
+
+ break;
+ case MODE_SECONDPASS_BEST:
+ cpi->pass = 2;
+ cpi->compressor_speed = 0;
+ break;
+#endif
+ }
+
+ if (cpi->pass == 0)
+ cpi->auto_worst_q = 1;
+
+ cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
+ cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
+
+ if (oxcf->fixed_q >= 0)
+ {
+ if (oxcf->worst_allowed_q < 0)
+ cpi->oxcf.fixed_q = q_trans[0];
+ else
+ cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
+
+ if (oxcf->alt_q < 0)
+ cpi->oxcf.alt_q = q_trans[0];
+ else
+ cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
+
+ if (oxcf->key_q < 0)
+ cpi->oxcf.key_q = q_trans[0];
+ else
+ cpi->oxcf.key_q = q_trans[oxcf->key_q];
+
+ if (oxcf->gold_q < 0)
+ cpi->oxcf.gold_q = q_trans[0];
+ else
+ cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
+
+ }
+
+ cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+
+ cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+
+ //cpi->use_golden_frame_only = 0;
+ //cpi->use_last_frame_only = 0;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->refresh_entropy_probs = 1;
+
+ if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
+ cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+
+ setup_features(cpi);
+
+ {
+ int i;
+
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+ }
+
+ // At the moment the first order values may not be > MAXQ
+ if (cpi->oxcf.fixed_q > MAXQ)
+ cpi->oxcf.fixed_q = MAXQ;
+
+ // local file playback mode == really big buffer
+ if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
+ {
+ cpi->oxcf.starting_buffer_level = 60;
+ cpi->oxcf.optimal_buffer_level = 60;
+ cpi->oxcf.maximum_buffer_size = 240;
+
+ }
+
+ // Convert target bandwidth from Kbit/s to Bit/s
+ cpi->oxcf.target_bandwidth *= 1000;
+
+ cpi->oxcf.starting_buffer_level *= cpi->oxcf.target_bandwidth;
+
+ if (cpi->oxcf.optimal_buffer_level == 0)
+ cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+ else
+ cpi->oxcf.optimal_buffer_level *= cpi->oxcf.target_bandwidth;
+
+ if (cpi->oxcf.maximum_buffer_size == 0)
+ cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+ else
+ cpi->oxcf.maximum_buffer_size *= cpi->oxcf.target_bandwidth;
+
+ cpi->buffer_level = cpi->oxcf.starting_buffer_level;
+ cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+
+ vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+ cpi->worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
+ cpi->best_quality = cpi->oxcf.best_allowed_q;
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+
+
+ cpi->total_actual_bits = 0;
+ cpi->total_target_vs_actual = 0;
+
+ // Only allow dropped frames in buffered mode
+ cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
+
+ cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
+
+ if (!cm->use_bilinear_mc_filter)
+ cm->mcomp_filter_type = SIXTAP;
+ else
+ cm->mcomp_filter_type = BILINEAR;
+
+ cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+ cm->Width = cpi->oxcf.Width ;
+ cm->Height = cpi->oxcf.Height ;
+
+ cm->horiz_scale = cpi->horiz_scale;
+ cm->vert_scale = cpi->vert_scale ;
+
+ cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
+
+ // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
+ if (cpi->oxcf.Sharpness > 7)
+ cpi->oxcf.Sharpness = 7;
+
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
+ {
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+ // always go to the next whole number
+ cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+ cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+ }
+
+ if (((cm->Width + 15) & 0xfffffff0) != cm->last_frame.y_width ||
+ ((cm->Height + 15) & 0xfffffff0) != cm->last_frame.y_height ||
+ cm->last_frame.y_width == 0)
+ {
+ alloc_raw_frame_buffers(cpi);
+ vp8_alloc_compressor_data(cpi);
+ }
+
+ // Clamp KF frame size to quarter of data rate
+ if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
+ cpi->intra_frame_target = cpi->target_bandwidth >> 2;
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ cpi->last_q[0] = cpi->oxcf.fixed_q;
+ cpi->last_q[1] = cpi->oxcf.fixed_q;
+ }
+
+ cpi->Speed = cpi->oxcf.cpu_used;
+
+ // force allow_lag to 0 if lag_in_frames is 0
+ if (cpi->oxcf.lag_in_frames == 0)
+ {
+ cpi->oxcf.allow_lag = 0;
+ }
+ // Limit on lag buffers as these are not currently dynamically allocated
+ else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+ cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+
+ // Force play_alternate to 0 if allow_lag is 0, lag_in_frames is too small, or the encoder is in a real time or one pass mode.
+ if (cpi->oxcf.allow_lag == 0 || cpi->oxcf.lag_in_frames <= 5 || (cpi->oxcf.Mode < MODE_SECONDPASS))
+ {
+ cpi->oxcf.play_alternate = 0;
+ cpi->ref_frame_flags = cpi->ref_frame_flags & ~VP8_ALT_FLAG;
+ }
+
+ // YX Temp
+ cpi->last_alt_ref_sei = -1;
+ cpi->is_src_frame_alt_ref = 0;
+
+#if 0
+ // Experimental RD Code
+ cpi->frame_distortion = 0;
+ cpi->last_frame_distortion = 0;
+#endif
+
+}
+
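+ // cal_mvsadcosts() below fills a cost table, symmetric about zero and covering -mv_max..mv_max,
+ // used to cost candidate motion vectors during the SAD-based stages of motion search. The value
+ // 256 * (2 * (log2(2i) + .6)) grows logarithmically with vector magnitude and roughly tracks the
+ // bit cost of coding a vector of size i, scaled by 256. Note the constant named M_LOG2_E here
+ // actually holds ln(2), so log(x) / ln(2) in the log2f() macro does yield a base-2 log.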
+#define M_LOG2_E 0.693147180559945309417
+#define log2f(x) (log (x) / (float) M_LOG2_E)
+static void cal_mvsadcosts(int *mvsadcost[2])
+{
+ int i = 1;
+
+ mvsadcost [0] [0] = 300;
+ mvsadcost [1] [0] = 300;
+
+ do
+ {
+ double z = 256 * (2 * (log2f(2 * i) + .6));
+ mvsadcost [0][i] = (int) z;
+ mvsadcost [1][i] = (int) z;
+ mvsadcost [0][-i] = (int) z;
+ mvsadcost [1][-i] = (int) z;
+ }
+ while (++i <= mv_max);
+}
+
+VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
+{
+ int i;
+ volatile union
+ {
+ VP8_COMP *cpi;
+ VP8_PTR ptr;
+ } ctx;
+
+ VP8_COMP *cpi;
+ VP8_COMMON *cm;
+
+ cpi = ctx.cpi = vpx_memalign(32, sizeof(VP8_COMP));
+ // Check that the allocation for the compressor instance succeeded
+ if (!cpi)
+ return 0;
+
+ cm = &cpi->common;
+
+ vpx_memset(cpi, 0, sizeof(VP8_COMP));
+
+ if (setjmp(cm->error.jmp))
+ {
+ VP8_PTR ptr = ctx.ptr;
+
+ ctx.cpi->common.error.setjmp = 0;
+ vp8_remove_compressor(&ptr);
+ return 0;
+ }
+
+ cpi->common.error.setjmp = 1;
+
+ CHECK_MEM_ERROR(cpi->rdtok, vpx_calloc(256 * 3 / 2, sizeof(TOKENEXTRA)));
+ CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
+
+ vp8_cmachine_specific_config(cpi);
+ vp8_create_common(&cpi->common);
+
+ vp8_init_config((VP8_PTR)cpi, oxcf);
+
+ memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
+ cpi->common.current_video_frame = 0;
+ cpi->kf_overspend_bits = 0;
+ cpi->kf_bitrate_adjustment = 0;
+ cpi->frames_till_gf_update_due = 0;
+ cpi->gf_overspend_bits = 0;
+ cpi->non_gf_bitrate_adjustment = 0;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ cpi->prob_intra_coded = 63;
+
+ // Prime the recent reference frame usage counters.
+ // Hereafter they will be maintained as a sort of moving average
+ cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+ cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+ // Set reference frame sign bias for ALTREF frame to 1 (for now)
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+
+ cpi->gf_decay_rate = 0;
+ cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+ cpi->gold_is_last = 0 ;
+ cpi->alt_is_last = 0 ;
+ cpi->gold_is_alt = 0 ;
+
+
+
+ // Create the encoder segmentation map and set all entries to 0
+ CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+ CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+ vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols));
+ cpi->active_map_enabled = 0;
+
+ // Create the first pass motion map structure and set to 0
+ CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(cpi->common.MBs, 1));
+
+#if 0
+ // Experimental code for lagged and one pass
+ // Initialise one_pass GF frames stats
+ // Update stats used for GF selection
+ if (cpi->pass == 0)
+ {
+ cpi->one_pass_frame_index = 0;
+
+ for (i = 0; i < MAX_LAG_BUFFERS; i++)
+ {
+ cpi->one_pass_frame_stats[i].frames_so_far = 0;
+ cpi->one_pass_frame_stats[i].frame_intra_error = 0.0;
+ cpi->one_pass_frame_stats[i].frame_coded_error = 0.0;
+ cpi->one_pass_frame_stats[i].frame_pcnt_inter = 0.0;
+ cpi->one_pass_frame_stats[i].frame_pcnt_motion = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvr = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvr_abs = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvc = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvc_abs = 0.0;
+ }
+ }
+#endif
+
+ // Should we use the cyclic refresh method?
+ // Currently this is tied to error resilient mode
+ cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
+ cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 40;
+ cpi->cyclic_refresh_mode_index = 0;
+ cpi->cyclic_refresh_q = 32;
+
+ if (cpi->cyclic_refresh_mode_enabled)
+ {
+ CHECK_MEM_ERROR(cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+ }
+ else
+ cpi->cyclic_refresh_map = (signed char *) NULL;
+
+ // Test function for segmentation
+ //segmentation_test_function((VP8_PTR) cpi);
+
+ // Loop filter mode / ref deltas test function
+ //mode_ref_lf_test_function(cpi);
+
+#ifdef ENTROPY_STATS
+ init_context_counters();
+#endif
+
+
+#ifdef INTRARDOPT
+ cpi->intra_rd_opt = 1;
+
+#endif
+
+ cpi->frames_since_key = 8; // Give a sensible default for the first frame.
+ cpi->key_frame_frequency = cpi->oxcf.key_freq;
+
+ cpi->source_alt_ref_pending = FALSE;
+ cpi->source_alt_ref_active = FALSE;
+ cpi->common.refresh_alt_ref_frame = 0;
+
+ cpi->b_calculate_psnr = CONFIG_PSNR;
+#if CONFIG_PSNR
+ cpi->b_calculate_ssimg = 0;
+
+ cpi->count = 0;
+ cpi->bytes = 0;
+
+ if (cpi->b_calculate_psnr)
+ {
+ cpi->total_sq_error = 0.0;
+ cpi->total_sq_error2 = 0.0;
+ cpi->total_y = 0.0;
+ cpi->total_u = 0.0;
+ cpi->total_v = 0.0;
+ cpi->total = 0.0;
+ cpi->totalp_y = 0.0;
+ cpi->totalp_u = 0.0;
+ cpi->totalp_v = 0.0;
+ cpi->totalp = 0.0;
+ cpi->tot_recode_hits = 0;
+ cpi->summed_quality = 0;
+ cpi->summed_weights = 0;
+ }
+
+ if (cpi->b_calculate_ssimg)
+ {
+ cpi->total_ssimg_y = 0;
+ cpi->total_ssimg_u = 0;
+ cpi->total_ssimg_v = 0;
+ cpi->total_ssimg_all = 0;
+ }
+
+#ifndef LLONG_MAX
+#define LLONG_MAX 9223372036854775807LL
+#endif
+ cpi->first_time_stamp_ever = LLONG_MAX;
+
+#endif
+
+ cpi->frames_till_gf_update_due = 0;
+ cpi->key_frame_count = 1;
+ cpi->tot_key_frame_bits = 0;
+
+ cpi->ni_av_qi = cpi->oxcf.worst_allowed_q;
+ cpi->ni_tot_qi = 0;
+ cpi->ni_frames = 0;
+ cpi->total_byte_count = 0;
+
+ cpi->drop_frame = 0;
+ cpi->drop_count = 0;
+ cpi->max_drop_count = 0;
+ cpi->max_consec_dropped_frames = 4;
+
+ cpi->rate_correction_factor = 1.0;
+ cpi->key_frame_rate_correction_factor = 1.0;
+ cpi->gf_rate_correction_factor = 1.0;
+ cpi->est_max_qcorrection_factor = 1.0;
+
+ cpi->mb.mvcost[0] = &cpi->mb.mvcosts[0][mv_max+1];
+ cpi->mb.mvcost[1] = &cpi->mb.mvcosts[1][mv_max+1];
+ cpi->mb.mvsadcost[0] = &cpi->mb.mvsadcosts[0][mv_max+1];
+ cpi->mb.mvsadcost[1] = &cpi->mb.mvsadcosts[1][mv_max+1];
+
+ cal_mvsadcosts(cpi->mb.mvsadcost);
+
+ for (i = 0; i < KEY_FRAME_CONTEXT; i++)
+ {
+ cpi->prior_key_frame_size[i] = cpi->intra_frame_target;
+ cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
+ }
+
+ cpi->check_freq[0] = 15;
+ cpi->check_freq[1] = 15;
+
+#ifdef OUTPUT_YUV_SRC
+ yuv_file = fopen("bd.yuv", "ab");
+#endif
+
+#if 0
+ framepsnr = fopen("framepsnr.stt", "a");
+ kf_list = fopen("kf_list.stt", "w");
+#endif
+
+ cpi->output_pkt_list = oxcf->output_pkt_list;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 1)
+ {
+ vp8_init_first_pass(cpi);
+ }
+ else if (cpi->pass == 2)
+ {
+ cpi->stats_in = oxcf->two_pass_stats_in.buf;
+ cpi->stats_in_end = cpi->stats_in
+ + oxcf->two_pass_stats_in.sz / sizeof(FIRSTPASS_STATS)
+ - 1;
+ vp8_init_second_pass(cpi);
+ }
+
+#endif
+
+ if (cpi->compressor_speed == 2)
+ {
+ cpi->cpu_freq = 0; //vp8_get_processor_freq();
+ cpi->avg_encode_time = 0;
+ cpi->avg_pick_mode_time = 0;
+ }
+
+ vp8_set_speed_features(cpi);
+
+ // Set starting values of RD threshold multipliers (128 = *1)
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ cpi->rd_thresh_mult[i] = 128;
+ }
+
+#ifdef ENTROPY_STATS
+ init_mv_ref_counts();
+#endif
+
+ vp8cx_create_encoder_threads(cpi);
+
+ cpi->fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
+ cpi->fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
+ cpi->fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16);
+ cpi->fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
+ cpi->fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
+
+#if !(CONFIG_REALTIME_ONLY)
+ cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
+#endif
+ cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
+
+ cpi->ready_for_new_frame = 1;
+
+ cpi->source_encode_index = 0;
+
+ // make sure frame 1 is okay
+ cpi->error_bins[0] = cpi->common.MBs;
+
+ // vp8cx_init_quantizer() is first called here. A check in vp8cx_frame_init_quantizer() ensures that
+ // vp8cx_init_quantizer() is only called again later when needed, avoiding unnecessary per-frame calls.
+ vp8cx_init_quantizer(cpi);
+ {
+ vp8_init_loop_filter(cm);
+ cm->last_frame_type = KEY_FRAME;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+ cpi->common.error.setjmp = 0;
+ return (VP8_PTR) cpi;
+
+}
+
+
+void vp8_remove_compressor(VP8_PTR *ptr)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(*ptr);
+
+ if (!cpi)
+ return;
+
+ if (cpi && (cpi->common.current_video_frame > 0))
+ {
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 2)
+ {
+ vp8_end_second_pass(cpi);
+ }
+
+#endif
+
+#ifdef ENTROPY_STATS
+ print_context_counters();
+ print_tree_update_probs();
+ print_mode_context();
+#endif
+
+#if CONFIG_PSNR
+
+ if (cpi->pass != 1)
+ {
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded = (cpi->source_end_time_stamp - cpi->first_time_stamp_ever) / 10000000.000;
+ double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
+ double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
+
+ if (cpi->b_calculate_psnr)
+ {
+ double samples = 3.0 / 2 * cpi->count * cpi->common.last_frame.y_width * cpi->common.last_frame.y_height;
+ double total_psnr = vp8_mse2psnr(samples, 255.0, cpi->total_sq_error);
+ double total_psnr2 = vp8_mse2psnr(samples, 255.0, cpi->total_sq_error2);
+ double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+
+ fprintf(f, "Bitrate\AVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t Time(us)\n");
+ fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f %8.0f\n",
+ dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
+ total_encode_time);
+ }
+
+ if (cpi->b_calculate_ssimg)
+ {
+ fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(us)\n");
+ fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
+ cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
+ cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
+ }
+
+ fclose(f);
+#if 0
+ f = fopen("qskip.stt", "a");
+ fprintf(f, "minq:%d -maxq:%d skipture:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount);
+ fclose(f);
+#endif
+
+ }
+
+#endif
+
+
+#ifdef SPEEDSTATS
+
+ if (cpi->compressor_speed == 2)
+ {
+ int i;
+ FILE *f = fopen("cxspeed.stt", "a");
+ cnt_pm /= cpi->common.MBs;
+
+ for (i = 0; i < 16; i++)
+ fprintf(f, "%5d", frames_at_speed[i]);
+
+ fprintf(f, "\n");
+ //fprintf(f, "%10d PM %10d %10d %10d EF %10d %10d %10d\n", cpi->Speed, cpi->avg_pick_mode_time, (tot_pm/cnt_pm), cnt_pm, cpi->avg_encode_time, 0, 0);
+ fclose(f);
+ }
+
+#endif
+
+
+#ifdef MODE_STATS
+ {
+ extern int count_mb_seg[4];
+ FILE *f = fopen("modes.stt", "a");
+ double dr = (double)cpi->oxcf.frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+ fprintf(f, "intra_mode in Intra Frames:\n");
+ fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
+ fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
+ fprintf(f, "B: ");
+ {
+ int i;
+
+ for (i = 0; i < 10; i++)
+ fprintf(f, "%8d, ", b_modes[i]);
+
+ fprintf(f, "\n");
+
+ }
+
+ fprintf(f, "Modes in Inter Frames:\n");
+ fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d\n",
+ inter_y_modes[0], inter_y_modes[1], inter_y_modes[2], inter_y_modes[3], inter_y_modes[4],
+ inter_y_modes[5], inter_y_modes[6], inter_y_modes[7], inter_y_modes[8], inter_y_modes[9]);
+ fprintf(f, "UV:%8d, %8d, %8d, %8d\n", inter_uv_modes[0], inter_uv_modes[1], inter_uv_modes[2], inter_uv_modes[3]);
+ fprintf(f, "B: ");
+ {
+ int i;
+
+ for (i = 0; i < 15; i++)
+ fprintf(f, "%8d, ", inter_b_modes[i]);
+
+ fprintf(f, "\n");
+
+ }
+ fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
+ fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
+
+
+
+ fclose(f);
+ }
+#endif
+
+#ifdef ENTROPY_STATS
+ {
+ int i, j, k;
+ FILE *fmode = fopen("modecontext.c", "w");
+
+ fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
+ fprintf(fmode, "const unsigned int vp8_kf_default_bmode_counts ");
+ fprintf(fmode, "[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =\n{\n");
+
+ for (i = 0; i < 10; i++)
+ {
+
+ fprintf(fmode, " { //Above Mode : %d\n", i);
+
+ for (j = 0; j < 10; j++)
+ {
+
+ fprintf(fmode, " {");
+
+ for (k = 0; k < 10; k++)
+ {
+ if (!intra_mode_stats[i][j][k])
+ fprintf(fmode, " %5d, ", 1);
+ else
+ fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
+ }
+
+ fprintf(fmode, "}, // left_mode %d\n", j);
+
+ }
+
+ fprintf(fmode, " },\n");
+
+ }
+
+ fprintf(fmode, "};\n");
+ }
+#endif
+
+
+#if defined(SECTIONBITS_OUTPUT)
+
+ if (0)
+ {
+ int i;
+ FILE *f = fopen("tokenbits.stt", "a");
+
+ for (i = 0; i < 28; i++)
+ fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
+
+ fprintf(f, "\n");
+ fclose(f);
+ }
+
+#endif
+
+#if 0
+ {
+ printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+ printf("\n_frames recive_data encod_mb_row compress_frame Total\n");
+ printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+ }
+#endif
+
+ }
+
+ vp8cx_remove_encoder_threads(cpi);
+
+ vp8_dealloc_compressor_data(cpi);
+ vpx_free(cpi->mb.ss);
+ vpx_free(cpi->tok);
+ vpx_free(cpi->rdtok);
+ vpx_free(cpi->cyclic_refresh_map);
+
+ vp8_remove_common(&cpi->common);
+ vpx_free(cpi);
+ *ptr = 0;
+
+#ifdef OUTPUT_YUV_SRC
+ fclose(yuv_file);
+#endif
+
+#if 0
+
+ if (keyfile)
+ fclose(keyfile);
+
+ if (framepsnr)
+ fclose(framepsnr);
+
+ if (kf_list)
+ fclose(kf_list);
+
+#endif
+
+}
+
+
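+ // calc_plane_error(): sum of squared differences between two planes. The bulk of the plane is
+ // covered with the optimized 16x16 MSE kernel from the RTCD variance table; any right-edge or
+ // bottom-edge remainder narrower than 16 pixels is accumulated with a plain scalar loop.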
+static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
+ unsigned char *recon, int recon_stride,
+ unsigned int cols, unsigned int rows,
+ vp8_variance_rtcd_vtable_t *rtcd)
+{
+ unsigned int row, col;
+ uint64_t total_sse = 0;
+ int diff;
+
+ for (row = 0; row + 16 <= rows; row += 16)
+ {
+ for (col = 0; col + 16 <= cols; col += 16)
+ {
+ unsigned int sse;
+
+ VARIANCE_INVOKE(rtcd, mse16x16)(orig + col, orig_stride,
+ recon + col, recon_stride,
+ &sse);
+ total_sse += sse;
+ }
+
+ /* Handle odd-sized width */
+ if (col < cols)
+ {
+ unsigned int border_row, border_col;
+ unsigned char *border_orig = orig;
+ unsigned char *border_recon = recon;
+
+ for (border_row = 0; border_row < 16; border_row++)
+ {
+ for (border_col = col; border_col < cols; border_col++)
+ {
+ diff = border_orig[border_col] - border_recon[border_col];
+ total_sse += diff * diff;
+ }
+
+ border_orig += orig_stride;
+ border_recon += recon_stride;
+ }
+ }
+
+ orig += orig_stride * 16;
+ recon += recon_stride * 16;
+ }
+
+ /* Handle odd-sized height */
+ for (; row < rows; row++)
+ {
+ for (col = 0; col < cols; col++)
+ {
+ diff = orig[col] - recon[col];
+ total_sse += diff * diff;
+ }
+
+ orig += orig_stride;
+ recon += recon_stride;
+ }
+
+ return total_sse;
+}
+
+
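+ // Build a VPX_CODEC_PSNR_PKT for the output packet list: sse[0]/samples[0] accumulate the
+ // combined Y+U+V totals while sse[1..3]/samples[1..3] hold the per-plane figures; each PSNR
+ // entry is presumably 10 * log10(255^2 * samples / sse) as computed by vp8_mse2psnr().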
+static void generate_psnr_packet(VP8_COMP *cpi)
+{
+ YV12_BUFFER_CONFIG *orig = cpi->Source;
+ YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ struct vpx_codec_cx_pkt pkt;
+ uint64_t sse;
+ int i;
+ unsigned int width = cpi->common.Width;
+ unsigned int height = cpi->common.Height;
+
+ pkt.kind = VPX_CODEC_PSNR_PKT;
+ sse = calc_plane_error(orig->y_buffer, orig->y_stride,
+ recon->y_buffer, recon->y_stride,
+ width, height,
+ IF_RTCD(&cpi->rtcd.variance));
+ pkt.data.psnr.sse[0] = sse;
+ pkt.data.psnr.sse[1] = sse;
+ pkt.data.psnr.samples[0] = width * height;
+ pkt.data.psnr.samples[1] = width * height;
+
+ width = (width + 1) / 2;
+ height = (height + 1) / 2;
+
+ sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ recon->u_buffer, recon->uv_stride,
+ width, height,
+ IF_RTCD(&cpi->rtcd.variance));
+ pkt.data.psnr.sse[0] += sse;
+ pkt.data.psnr.sse[2] = sse;
+ pkt.data.psnr.samples[0] += width * height;
+ pkt.data.psnr.samples[2] = width * height;
+
+ sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ recon->v_buffer, recon->uv_stride,
+ width, height,
+ IF_RTCD(&cpi->rtcd.variance));
+ pkt.data.psnr.sse[0] += sse;
+ pkt.data.psnr.sse[3] = sse;
+ pkt.data.psnr.samples[0] += width * height;
+ pkt.data.psnr.samples[3] = width * height;
+
+ for (i = 0; i < 4; i++)
+ pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0,
+ pkt.data.psnr.sse[i]);
+
+ vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+
+int vp8_use_as_reference(VP8_PTR ptr, int ref_frame_flags)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+ if (ref_frame_flags > 7)
+ return -1 ;
+
+ cpi->ref_frame_flags = ref_frame_flags;
+ return 0;
+}
+int vp8_update_reference(VP8_PTR ptr, int ref_frame_flags)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+ if (ref_frame_flags > 7)
+ return -1 ;
+
+ cpi->common.refresh_golden_frame = 0;
+ cpi->common.refresh_alt_ref_frame = 0;
+ cpi->common.refresh_last_frame = 0;
+
+ if (ref_frame_flags & VP8_LAST_FLAG)
+ cpi->common.refresh_last_frame = 1;
+
+ if (ref_frame_flags & VP8_GOLD_FLAG)
+ cpi->common.refresh_golden_frame = 1;
+
+ if (ref_frame_flags & VP8_ALT_FLAG)
+ cpi->common.refresh_alt_ref_frame = 1;
+
+ return 0;
+}
+
+int vp8_get_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+ VP8_COMMON *cm = &cpi->common;
+
+ if (ref_frame_flag == VP8_LAST_FLAG)
+ vp8_yv12_copy_frame_ptr(&cm->last_frame, sd);
+
+ else if (ref_frame_flag == VP8_GOLD_FLAG)
+ vp8_yv12_copy_frame_ptr(&cm->golden_frame, sd);
+
+ else if (ref_frame_flag == VP8_ALT_FLAG)
+ vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, sd);
+
+ else
+ return -1;
+
+ return 0;
+}
+int vp8_set_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(ptr);
+ VP8_COMMON *cm = &cpi->common;
+
+ if (ref_frame_flag == VP8_LAST_FLAG)
+ vp8_yv12_copy_frame_ptr(sd, &cm->last_frame);
+
+ else if (ref_frame_flag == VP8_GOLD_FLAG)
+ vp8_yv12_copy_frame_ptr(sd, &cm->golden_frame);
+
+ else if (ref_frame_flag == VP8_ALT_FLAG)
+ vp8_yv12_copy_frame_ptr(sd, &cm->alt_ref_frame);
+
+ else
+ return -1;
+
+ return 0;
+}
+int vp8_update_entropy(VP8_PTR comp, int update)
+{
+ VP8_COMP *cpi = (VP8_COMP *) comp;
+ VP8_COMMON *cm = &cpi->common;
+ cm->refresh_entropy_probs = update;
+
+ return 0;
+}
+
+void vp8_write_yuv_frame(const char *name, YV12_BUFFER_CONFIG *s)
+{
+ FILE *yuv_file = fopen(name, "ab");
+ unsigned char *src = s->y_buffer;
+ int h = s->y_height;
+
+ do
+ {
+ fwrite(src, s->y_width, 1, yuv_file);
+ src += s->y_stride;
+ }
+ while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do
+ {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ }
+ while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do
+ {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ }
+ while (--h);
+
+ fclose(yuv_file);
+}
+
+static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ // are we resizing the image
+ if (cm->horiz_scale != 0 || cm->vert_scale != 0)
+ {
+#if CONFIG_SPATIAL_RESAMPLING
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+ int tmp_height;
+
+ if (cm->vert_scale == 3)
+ tmp_height = 9;
+ else
+ tmp_height = 11;
+
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+ vp8_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer,
+ tmp_height, hs, hr, vs, vr, 0);
+
+ cpi->Source = &cpi->scaled_source;
+#endif
+ }
+ // we may need to copy to a buffer so we can extend the image...
+ else if (cm->Width != cm->last_frame.y_width ||
+ cm->Height != cm->last_frame.y_height)
+ {
+ //vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
+#if HAVE_ARMV7
+ vp8_yv12_copy_src_frame_func_neon(sd, &cpi->scaled_source);
+#else
+ vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
+#endif
+
+ cpi->Source = &cpi->scaled_source;
+ }
+
+ vp8_extend_to_multiple_of16(cpi->Source, cm->Width, cm->Height);
+
+}
+static void resize_key_frame(VP8_COMP *cpi)
+{
+#if CONFIG_SPATIAL_RESAMPLING
+ VP8_COMMON *cm = &cpi->common;
+
+ // Do we need to apply resampling for one pass CBR?
+ // In one pass this is more limited than in two pass CBR.
+ // The test and any change is only made once per key frame sequence.
+ if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER))
+ {
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+ int new_width, new_height;
+
+ // If we are below the resample DOWN watermark then scale down a notch.
+ if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))
+ {
+ cm->horiz_scale = (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO;
+ cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO;
+ }
+ // Should we now start scaling back up
+ else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))
+ {
+ cm->horiz_scale = (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL;
+ cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
+ }
+
+ // Get the new height and width
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+ new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
+ new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
+
+ // If the image size has changed we need to reallocate the buffers
+ // and resample the source image
+ if ((cm->Width != new_width) || (cm->Height != new_height))
+ {
+ cm->Width = new_width;
+ cm->Height = new_height;
+ vp8_alloc_compressor_data(cpi);
+ scale_and_extend_source(cpi->un_scaled_source, cpi);
+ }
+ }
+
+#endif
+}
+// return of 0 means drop frame
+static int pick_frame_size(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ // First Frame is a special case
+ if (cm->current_video_frame == 0)
+ {
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 2)
+ vp8_calc_auto_iframe_target_size(cpi);
+
+ // In one pass there is no information on which to base the size, so use bandwidth per second * a fixed fraction
+ else
+#endif
+ cpi->this_frame_target = cpi->oxcf.target_bandwidth / 2;
+
+ // in error resilient mode the first frame is bigger since it likely contains
+ // all the static background
+ if (cpi->oxcf.error_resilient_mode == 1 || (cpi->compressor_speed == 2))
+ {
+ cpi->this_frame_target *= 3; // 5;
+ }
+
+ // Key frame from VFW/auto-keyframe/first frame
+ cm->frame_type = KEY_FRAME;
+
+ }
+ // Auto key frames (Only two pass will enter here)
+ else if (cm->frame_type == KEY_FRAME)
+ {
+ vp8_calc_auto_iframe_target_size(cpi);
+ }
+ // Forced key frames (by interval or an external signal)
+ else if ((cm->frame_flags & FRAMEFLAGS_KEY) ||
+ (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0)))
+ {
+ // Key frame from VFW/auto-keyframe/first frame
+ cm->frame_type = KEY_FRAME;
+
+ resize_key_frame(cpi);
+
+ // Compute target frame size
+ if (cpi->pass != 2)
+ vp8_calc_iframe_target_size(cpi);
+ }
+ else
+ {
+ // INTER frame: compute target frame size
+ cm->frame_type = INTER_FRAME;
+ vp8_calc_pframe_target_size(cpi);
+
+ // Check if we're dropping the frame:
+ if (cpi->drop_frame)
+ {
+ cpi->drop_frame = FALSE;
+ cpi->drop_count++;
+ return 0;
+ }
+ }
+
+ // Note target_size in bits * 256 per MB
+ cpi->target_bits_per_mb = (cpi->this_frame_target * 256) / cpi->common.MBs;
+
+ return 1;
+}
+static void set_quantizer(VP8_COMP *cpi, int Q)
+{
+ VP8_COMMON *cm = &cpi->common;
+ MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+
+ cm->base_qindex = Q;
+
+ cm->y1dc_delta_q = 0;
+ cm->y2dc_delta_q = 0;
+ cm->y2ac_delta_q = 0;
+ cm->uvdc_delta_q = 0;
+ cm->uvac_delta_q = 0;
+
+ // Set segment specific quantizers
+ mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
+ mbd->segment_feature_data[MB_LVL_ALT_Q][1] = cpi->segment_feature_data[MB_LVL_ALT_Q][1];
+ mbd->segment_feature_data[MB_LVL_ALT_Q][2] = cpi->segment_feature_data[MB_LVL_ALT_Q][2];
+ mbd->segment_feature_data[MB_LVL_ALT_Q][3] = cpi->segment_feature_data[MB_LVL_ALT_Q][3];
+}
+
+static void update_alt_ref_frame_and_stats(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ // Update the golden frame buffer
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
+
+ // Select an interval before next GF or altref
+ if (!cpi->auto_gold)
+ cpi->frames_till_gf_update_due = cpi->goldfreq;
+
+ if ((cpi->pass != 2) && cpi->frames_till_gf_update_due)
+ {
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+ // Set the bits per frame that we should try and recover in subsequent inter frames
+ // to account for the extra GF spend... note that this does not apply for GF updates
+ // that occur coincident with a key frame as the extra cost of key frames is dealt
+ // with elsewhere.
+
+ cpi->gf_overspend_bits += cpi->projected_frame_size;
+ cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
+ }
+
+ // Update data structure that monitors level of reference to last GF
+ vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cm->gf_active_count = cm->mb_rows * cm->mb_cols;
+ // This frame is a refresh; subsequent frames do not refresh unless explicitly requested by the user
+
+ cpi->common.frames_since_golden = 0;
+
+ // Clear the alternate reference update pending flag.
+ cpi->source_alt_ref_pending = FALSE;
+
+ // Set the alternate reference frame active flag
+ cpi->source_alt_ref_active = TRUE;
+
+
+}
+static void update_golden_frame_and_stats(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ // Update the golden frame reconstruction buffer (if signalled) and the GF usage counts.
+ if (cm->refresh_golden_frame)
+ {
+ // Update the golden frame buffer
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
+
+ // Select an interval before next GF
+ if (!cpi->auto_gold)
+ cpi->frames_till_gf_update_due = cpi->goldfreq;
+
+ if ((cpi->pass != 2) && (cpi->frames_till_gf_update_due > 0))
+ {
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+ // Set the bits per frame that we should try and recover in subsequent inter frames
+ // to account for the extra GF spend... note that this does not apply for GF updates
+ // that occur coincident with a key frame as the extra cost of key frames is dealt
+ // with elsewhere.
+ if ((cm->frame_type != KEY_FRAME) && !cpi->source_alt_ref_active)
+ {
+ // Calculate GF bits to be recovered
+ // Projected size - av frame bits available for inter frames for clip as a whole
+ cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->inter_frame_target);
+ }
+
+ cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
+
+ }
+
+ // Update data structure that monitors level of reference to last GF
+ vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cm->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+ // This frame is a refresh; subsequent frames do not refresh unless explicitly requested by the user
+ cm->refresh_golden_frame = 0;
+ cpi->common.frames_since_golden = 0;
+
+ //if ( cm->frame_type == KEY_FRAME )
+ //{
+ cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+ cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+ //}
+ //else
+ //{
+ // // Carry a portion of count over to beginning of next gf sequence
+ // cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
+ // cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
+ // cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
+ // cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
+ //}
+
+ // ******** Fixed Q test code only ************
+ // If we are going to use the ALT reference for the next group of frames set a flag to say so.
+ if (cpi->oxcf.fixed_q >= 0 &&
+ cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->source_alt_ref_pending = TRUE;
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ }
+
+ if (!cpi->source_alt_ref_pending)
+ cpi->source_alt_ref_active = FALSE;
+
+ // Decrement count down till next gf
+ if (cpi->frames_till_gf_update_due > 0)
+ cpi->frames_till_gf_update_due--;
+
+ }
+ else if (!cpi->common.refresh_alt_ref_frame)
+ {
+ // Decrement count down till next gf
+ if (cpi->frames_till_gf_update_due > 0)
+ cpi->frames_till_gf_update_due--;
+
+ if (cpi->common.frames_till_alt_ref_frame)
+ cpi->common.frames_till_alt_ref_frame --;
+
+ cpi->common.frames_since_golden ++;
+
+ if (cpi->common.frames_since_golden > 1)
+ {
+ cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
+ cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+ }
+ }
+}
+
+// This function updates the reference frame probability estimates that
+// will be used during mode selection
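+ // The probabilities are 8-bit values derived from the previous frame's per-macroblock reference
+ // counts, e.g. prob_intra_coded = 255 * intra_count / (intra_count + inter_count), clamped to be
+ // at least 1, with fixed overrides applied when the alt-ref is refreshed or a golden frame
+ // refresh has just occurred.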
+static void update_rd_ref_frame_probs(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+#if 0
+ const int *const rfct = cpi->recent_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->prob_intra_coded = 255;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ }
+ else if (!(rf_intra + rf_inter))
+ {
+ // This is a trap in case this function is called with cpi->recent_ref_frame_usage[] blank.
+ cpi->prob_intra_coded = 63;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ }
+ else
+ {
+ cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter);
+
+ if (cpi->prob_intra_coded < 1)
+ cpi->prob_intra_coded = 1;
+
+ if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active)
+ {
+ cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+ if (cpi->prob_last_coded < 1)
+ cpi->prob_last_coded = 1;
+
+ cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+ if (cpi->prob_gf_coded < 1)
+ cpi->prob_gf_coded = 1;
+ }
+ }
+
+#else
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->prob_intra_coded = 255;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ }
+ else if (!(rf_intra + rf_inter))
+ {
+ // This is a trap in case this function is called with cpi->count_mb_ref_frame_usage[] blank.
+ cpi->prob_intra_coded = 63;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ }
+ else
+ {
+ cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter);
+
+ if (cpi->prob_intra_coded < 1)
+ cpi->prob_intra_coded = 1;
+
+ cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+ if (cpi->prob_last_coded < 1)
+ cpi->prob_last_coded = 1;
+
+ cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+ if (cpi->prob_gf_coded < 1)
+ cpi->prob_gf_coded = 1;
+ }
+
+ // update reference frame costs since we can do better than what we got last frame.
+
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->prob_intra_coded += 40;
+ cpi->prob_last_coded = 200;
+ cpi->prob_gf_coded = 1;
+ }
+ else if (cpi->common.frames_since_golden == 0)
+ {
+ cpi->prob_last_coded = 214;
+ cpi->prob_gf_coded = 1;
+ }
+ else if (cpi->common.frames_since_golden == 1)
+ {
+ cpi->prob_last_coded = 192;
+ cpi->prob_gf_coded = 220;
+ }
+ else if (cpi->source_alt_ref_active)
+ {
+ //int dist = cpi->common.frames_till_alt_ref_frame + cpi->common.frames_since_golden;
+ cpi->prob_gf_coded -= 20;
+
+ if (cpi->prob_gf_coded < 10)
+ cpi->prob_gf_coded = 10;
+ }
+
+#endif
+}
+
+
+// 1 = key, 0 = inter
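+ // Heuristic: promote an inter frame to a key frame when the proportion of intra-coded
+ // macroblocks jumps well above the previous frame's level (the threshold pairs below), or, for
+ // the fast real-time path, when the intra error comes close to the prediction error and both
+ // errors have changed sharply since the last frame.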
+static int decide_key_frame(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int code_key_frame = FALSE;
+
+ cpi->kf_boost = 0;
+
+ if (cpi->Speed > 11)
+ return FALSE;
+
+ // Clear down mmx registers
+ vp8_clear_system_state(); //__asm emms;
+
+ if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0))
+ {
+ double change = 1.0 * abs((int)(cpi->intra_error - cpi->last_intra_error)) / (1 + cpi->last_intra_error);
+ double change2 = 1.0 * abs((int)(cpi->prediction_error - cpi->last_prediction_error)) / (1 + cpi->last_prediction_error);
+ double minerror = cm->MBs * 256;
+
+#if 0
+
+ if (10 * cpi->intra_error / (1 + cpi->prediction_error) < 15
+ && cpi->prediction_error > minerror
+ && (change > .25 || change2 > .25))
+ {
+ FILE *f = fopen("intra_inter.stt", "a");
+
+ if (cpi->prediction_error <= 0)
+ cpi->prediction_error = 1;
+
+ fprintf(f, "%d %d %d %d %14.4f\n",
+ cm->current_video_frame,
+ (int) cpi->prediction_error,
+ (int) cpi->intra_error,
+ (int)((10 * cpi->intra_error) / cpi->prediction_error),
+ change);
+
+ fclose(f);
+ }
+
+#endif
+
+ cpi->last_intra_error = cpi->intra_error;
+ cpi->last_prediction_error = cpi->prediction_error;
+
+ if (10 * cpi->intra_error / (1 + cpi->prediction_error) < 15
+ && cpi->prediction_error > minerror
+ && (change > .25 || change2 > .25))
+ {
+ /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/
+ return TRUE;
+ }
+
+ return FALSE;
+
+ }
+
+ // If the following are true we might as well code a key frame
+ if (((cpi->this_frame_percent_intra == 100) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 2))) ||
+ ((cpi->this_frame_percent_intra > 95) &&
+ (cpi->this_frame_percent_intra >= (cpi->last_frame_percent_intra + 5))))
+ {
+ code_key_frame = TRUE;
+ }
+ // In addition, if the following are true and this is not a golden frame then code a key frame.
+ // Note that on golden frames there often seems to be a pop in intra usage anyway, hence this
+ // restriction is designed to prevent spurious key frames. The intra pop needs to be investigated.
+ else if (((cpi->this_frame_percent_intra > 60) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 2))) ||
+ ((cpi->this_frame_percent_intra > 75) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 3 / 2))) ||
+ ((cpi->this_frame_percent_intra > 90) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 10))))
+ {
+ if (!cm->refresh_golden_frame)
+ code_key_frame = TRUE;
+ }
+
+ return code_key_frame;
+
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
+{
+ (void) size;
+ (void) dest;
+ (void) frame_flags;
+ set_quantizer(cpi, 26);
+
+ scale_and_extend_source(cpi->un_scaled_source, cpi);
+ vp8_first_pass(cpi);
+}
+#endif
+
+#if 0
+void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
+{
+
+ // write the frame
+ FILE *yframe;
+ int i;
+ char filename[255];
+
+ sprintf(filename, "cx\\y%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->y_height; i++)
+ fwrite(frame->y_buffer + i * frame->y_stride, frame->y_width, 1, yframe);
+
+ fclose(yframe);
+ sprintf(filename, "cx\\u%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->uv_height; i++)
+ fwrite(frame->u_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe);
+
+ fclose(yframe);
+ sprintf(filename, "cx\\v%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->uv_height; i++)
+ fwrite(frame->v_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe);
+
+ fclose(yframe);
+}
+#endif
+// return of 0 means drop frame
+
+#if VP8_TEMPORAL_ALT_REF
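+ // Temporal blur used to construct the alt-ref frame. Blocks flagged as moving in the first pass
+ // motion map (value > 2) are copied through untouched; elsewhere each pixel becomes a weighted
+ // average over the candidate frames, with per-frame weight 16 - min(16, 3 * ((diff * diff) >> strength)),
+ // so frames whose pixel differs strongly from the source contribute little. fixed_divide[] is
+ // presumably a table of 65536 / count reciprocals, which the final >> 16 uses to normalise the sum.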
+static void vp8cx_temp_blur1_c
+(
+ unsigned char **frames,
+ int frame_count,
+ unsigned char *src,
+ unsigned char *dst,
+ int width,
+ int stride,
+ int height,
+ int strength,
+ int *fixed_divide,
+ unsigned char *motion_map_ptr,
+ unsigned char block_size
+)
+{
+ int byte = 0; // Buffer offset for the current pixel value being filtered
+ int frame = 0;
+ int modifier = 0;
+ int i, j, k;
+ int block_ofset;
+ int Cols, Rows;
+ unsigned char Shift = (block_size == 16) ? 4 : 3;
+
+ Cols = width / block_size;
+ Rows = height / block_size;
+
+ for (i = 0; i < height; i++)
+ {
+ block_ofset = (i >> Shift) * Cols;
+
+ for (j = 0; j < Cols; j ++)
+ {
+ if (motion_map_ptr[block_ofset] > 2)
+ {
+ vpx_memcpy(&dst[byte], &src[byte], block_size);
+ byte += block_size;
+ }
+ else
+ {
+ for (k = 0; k < block_size; k++)
+ {
+ int accumulator = 0;
+ int count = 0;
+ int src_byte = src[byte];
+
+ for (frame = 0; frame < frame_count; frame++)
+ {
+ // get current frame pixel value
+ int pixel_value = frames[frame][byte]; // int pixel_value = *frameptr;
+
+ modifier = src_byte; // modifier = s[byte];
+ modifier -= pixel_value;
+ modifier *= modifier;
+ modifier >>= strength;
+ modifier *= 3;
+
+ if (modifier > 16)
+ modifier = 16;
+
+ modifier = 16 - modifier;
+
+ accumulator += modifier * pixel_value;
+
+ count += modifier;
+ }
+
+ accumulator += (count >> 1);
+ accumulator *= fixed_divide[count]; // accumulator *= ppi->fixed_divide[count];
+ accumulator >>= 16;
+
+ dst[byte] = accumulator; // d[byte] = accumulator;
+
+ // move to next pixel
+ byte++;
+ }
+ }
+
+ block_ofset++;
+ }
+
+ // Step byte on over the UMV border to the start of the next line
+ byte += stride - width;
+ }
+}
+
+static void vp8cx_temp_filter_c
+(
+ VP8_COMP *cpi
+)
+{
+ YV12_BUFFER_CONFIG *temp_source_buffer;
+ int *fixed_divide = cpi->fixed_divide;
+
+ int frame = 0;
+ int max_frames = 11;
+
+ int num_frames_backward = 0;
+ int num_frames_forward = 0;
+ int frames_to_blur_backward = 0;
+ int frames_to_blur_forward = 0;
+ int frames_to_blur = 0;
+ int start_frame = 0;
+
+ int strength = cpi->oxcf.arnr_strength;
+
+ int blur_type = cpi->oxcf.arnr_type;
+
+ int new_max_frames = cpi->oxcf.arnr_max_frames;
+
+ if (new_max_frames > 0)
+ max_frames = new_max_frames;
+
+ num_frames_backward = cpi->last_alt_ref_sei - cpi->source_encode_index;
+
+ if (num_frames_backward < 0)
+ num_frames_backward += cpi->oxcf.lag_in_frames;
+
+ num_frames_forward = cpi->oxcf.lag_in_frames - (num_frames_backward + 1);
+
+ switch (blur_type)
+ {
+ case 1:
+ /////////////////////////////////////////
+ // Backward Blur
+
+ frames_to_blur_backward = num_frames_backward;
+
+ if (frames_to_blur_backward >= max_frames)
+ frames_to_blur_backward = max_frames - 1;
+
+ frames_to_blur = frames_to_blur_backward + 1;
+ break;
+
+ case 2:
+ /////////////////////////////////////////
+ // Forward Blur
+
+ frames_to_blur_forward = num_frames_forward;
+
+ if (frames_to_blur_forward >= max_frames)
+ frames_to_blur_forward = max_frames - 1;
+
+ frames_to_blur = frames_to_blur_forward + 1;
+ break;
+
+ case 3:
+ /////////////////////////////////////////
+ // Center Blur
+ frames_to_blur_forward = num_frames_forward;
+ frames_to_blur_backward = num_frames_backward;
+
+ if (frames_to_blur_forward > frames_to_blur_backward)
+ frames_to_blur_forward = frames_to_blur_backward;
+
+ if (frames_to_blur_backward > frames_to_blur_forward)
+ frames_to_blur_backward = frames_to_blur_forward;
+
+ if (frames_to_blur_forward > (max_frames / 2))
+ frames_to_blur_forward = (max_frames / 2);
+
+ if (frames_to_blur_backward > (max_frames / 2))
+ frames_to_blur_backward = (max_frames / 2);
+
+ frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+ break;
+
+ default:
+ /////////////////////////////////////////
+ // At most 4 frames forward Blur
+ frames_to_blur_forward = 4;
+ frames_to_blur_backward = num_frames_backward;
+
+ if (max_frames > 5)
+ {
+ if ((frames_to_blur_backward + frames_to_blur_forward) >= max_frames)
+ {
+ frames_to_blur_backward = max_frames - frames_to_blur_forward - 1;
+ }
+ }
+ else
+ {
+ frames_to_blur_forward = max_frames - 1;
+ frames_to_blur_backward = 0;
+ }
+
+ frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+ break;
+ }
+
+ start_frame = (cpi->last_alt_ref_sei + frames_to_blur_forward) % cpi->oxcf.lag_in_frames;
+
+#ifdef DEBUGFWG
+ // DEBUG FWG
+ printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
+ , max_frames
+ , num_frames_backward
+ , num_frames_forward
+ , frames_to_blur
+ , frames_to_blur_backward
+ , frames_to_blur_forward
+ , cpi->source_encode_index
+ , cpi->last_alt_ref_sei
+ , start_frame);
+#endif
+
+ for (frame = 0; frame < frames_to_blur; frame++)
+ {
+ int which_buffer = start_frame - frame;
+
+ if (which_buffer < 0)
+ which_buffer += cpi->oxcf.lag_in_frames;
+
+ cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.y_buffer;
+ }
+
+ temp_source_buffer = &cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer;
+
+ // Blur Y
+ vp8cx_temp_blur1_c(
+ cpi->frames,
+ frames_to_blur,
+ temp_source_buffer->y_buffer, // cpi->Source->y_buffer,
+ cpi->alt_ref_buffer.source_buffer.y_buffer, // cpi->Source->y_buffer,
+ temp_source_buffer->y_width,
+ temp_source_buffer->y_stride,
+ temp_source_buffer->y_height,
+ //temp_source_buffer->y_height * temp_source_buffer->y_stride,
+ strength,
+ fixed_divide,
+ cpi->fp_motion_map, 16);
+
+ for (frame = 0; frame < frames_to_blur; frame++)
+ {
+ int which_buffer = cpi->last_alt_ref_sei - frame;
+
+ if (which_buffer < 0)
+ which_buffer += cpi->oxcf.lag_in_frames;
+
+ cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.u_buffer;
+ }
+
+ // Blur U
+ vp8cx_temp_blur1_c(
+ cpi->frames,
+ frames_to_blur,
+ temp_source_buffer->u_buffer,
+ cpi->alt_ref_buffer.source_buffer.u_buffer, // cpi->Source->u_buffer,
+ temp_source_buffer->uv_width,
+ temp_source_buffer->uv_stride,
+ temp_source_buffer->uv_height,
+ //temp_source_buffer->uv_height * temp_source_buffer->uv_stride,
+ strength,
+ fixed_divide,
+ cpi->fp_motion_map, 8);
+
+ for (frame = 0; frame < frames_to_blur; frame++)
+ {
+ int which_buffer = cpi->last_alt_ref_sei - frame;
+
+ if (which_buffer < 0)
+ which_buffer += cpi->oxcf.lag_in_frames;
+
+ cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.v_buffer;
+ }
+
+ // Blur V
+ vp8cx_temp_blur1_c(
+ cpi->frames,
+ frames_to_blur,
+ temp_source_buffer->v_buffer,
+ cpi->alt_ref_buffer.source_buffer.v_buffer, // cpi->Source->v_buffer,
+ temp_source_buffer->uv_width,
+ temp_source_buffer->uv_stride,
+ //temp_source_buffer->uv_height * temp_source_buffer->uv_stride,
+ temp_source_buffer->uv_height,
+ strength,
+ fixed_divide,
+ cpi->fp_motion_map, 8);
+}
+#endif
+
+
+static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
+{
+ int Q;
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+
+ int Loop = FALSE;
+ int loop_count;
+ int this_q;
+ int last_zbin_oq;
+
+ int q_low;
+ int q_high;
+ int zbin_oq_high;
+ int zbin_oq_low = 0;
+ int top_index;
+ int bottom_index;
+ VP8_COMMON *cm = &cpi->common;
+ int active_worst_qchanged = FALSE;
+
+ int overshoot_seen = FALSE;
+ int undershoot_seen = FALSE;
+ int drop_mark = cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100;
+ int drop_mark75 = drop_mark * 2 / 3;
+ int drop_mark50 = drop_mark / 4;
+ int drop_mark25 = drop_mark / 8;
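+ // Note: despite the 75/50/25 names, the thresholds are 2/3, 1/4 and 1/8 of drop_mark respectively.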
+
+ // Clear down mmx registers to allow floating point in what follows
+ vp8_clear_system_state();
+
+ // Test code for segmentation of gf/arf (0,0)
+ //segmentation_test_function((VP8_PTR) cpi);
+
+ // For an alt ref frame in 2 pass we skip the call to the second pass function that sets the target bandwidth
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 2)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->per_frame_bandwidth = cpi->gf_bits; // Per frame bit target for the alt ref frame
+ cpi->target_bandwidth = cpi->gf_bits * cpi->output_frame_rate; // per second target bitrate
+ }
+ }
+ else
+#endif
+ cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_frame_rate);
+
+ // Default turn off buffer to buffer copying
+ cm->copy_buffer_to_gf = 0;
+ cm->copy_buffer_to_arf = 0;
+
+ // Clear zbin over-quant value and mode boost values.
+ cpi->zbin_over_quant = 0;
+ cpi->zbin_mode_boost = 0;
+
+ // Enable mode based tweaking of the zbin
+ cpi->zbin_mode_boost_enabled = TRUE;
+
+ // Current default encoder behaviour for the altref sign bias
+ if (cpi->source_alt_ref_active)
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+ else
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+ // Check to see if a key frame is signalled
+ // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
+ if ((cm->current_video_frame == 0) ||
+ (cm->frame_flags & FRAMEFLAGS_KEY) ||
+ (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0)))
+ {
+ // Key frame from VFW/auto-keyframe/first frame
+ cm->frame_type = KEY_FRAME;
+ }
+
+ // Set default state for segment and mode based loop filter update flags
+ cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+
+ // Set various flags etc to special state if it is a key frame
+ if (cm->frame_type == KEY_FRAME)
+ {
+ int i;
+
+ // If segmentation is enabled force a map update for key frames
+ if (cpi->mb.e_mbd.segmentation_enabled)
+ {
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+ }
+
+ // If mode or reference frame based loop filter deltas are enabled then force an update for key frames.
+ if (cpi->mb.e_mbd.mode_ref_lf_delta_enabled)
+ {
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+ }
+
+ // The alternate reference frame cannot be active for a key frame
+ cpi->source_alt_ref_active = FALSE;
+
+ // Reset the RD threshold multipliers to default of * 1 (128)
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ cpi->rd_thresh_mult[i] = 128;
+ }
+ }
+
+ // Test code for segmentation
+ //if ( (cm->frame_type == KEY_FRAME) || ((cm->current_video_frame % 2) == 0))
+ //if ( (cm->current_video_frame % 2) == 0 )
+ // enable_segmentation((VP8_PTR)cpi);
+ //else
+ // disable_segmentation((VP8_PTR)cpi);
+
+#if 0
+ // Experimental code for lagged compress and one pass
+ // Initialise one_pass GF frames stats
+ // Update stats used for GF selection
+ //if ( cpi->pass == 0 )
+ {
+ cpi->one_pass_frame_index = cm->current_video_frame % MAX_LAG_BUFFERS;
+
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frames_so_far = 0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_intra_error = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_coded_error = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_inter = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_motion = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr_abs = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc_abs = 0.0;
+ }
+#endif
+
+ update_rd_ref_frame_probs(cpi);
+
+ if (cpi->drop_frames_allowed)
+ {
+ // The reset to decimation 0 is only done here for one pass.
+ // Once it is set two pass leaves decimation on till the next kf.
+ if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0))
+ cpi->decimation_factor --;
+
+ if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0)
+ cpi->decimation_factor = 1;
+
+ else if (cpi->buffer_level < drop_mark25 && (cpi->decimation_factor == 2 || cpi->decimation_factor == 3))
+ {
+ cpi->decimation_factor = 3;
+ }
+ else if (cpi->buffer_level < drop_mark50 && (cpi->decimation_factor == 1 || cpi->decimation_factor == 2))
+ {
+ cpi->decimation_factor = 2;
+ }
+ else if (cpi->buffer_level < drop_mark75 && (cpi->decimation_factor == 0 || cpi->decimation_factor == 1))
+ {
+ cpi->decimation_factor = 1;
+ }
+
+ //vpx_log("Encoder: Decimation Factor: %d \n",cpi->decimation_factor);
+ }
+
+ // The following decimates the frame rate according to a regular pattern (i.e. to 1/2 or 2/3 frame rate)
+ // This can be used to help prevent buffer under-run in CBR mode. Alternatively it might be desirable in
+ // some situations to drop frame rate but throw more bits at each frame.
+ //
+ // Note that dropping a key frame can be problematic if spatial resampling is also active
+ if (cpi->decimation_factor > 0)
+ {
+ switch (cpi->decimation_factor)
+ {
+ case 1:
+ cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2;
+ break;
+ case 2:
+ cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
+ break;
+ case 3:
+ cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
+ break;
+ }
+
+ // Note that we should not throw out a key frame (especially when spatial resampling is enabled).
+ if ((cm->frame_type == KEY_FRAME)) // && cpi->oxcf.allow_spatial_resampling )
+ {
+ cpi->decimation_count = cpi->decimation_factor;
+ }
+ else if (cpi->decimation_count > 0)
+ {
+ cpi->decimation_count --;
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+
+#if CONFIG_PSNR
+ cpi->count ++;
+#endif
+
+ cpi->buffer_level = cpi->bits_off_target;
+
+ return;
+ }
+ else
+ cpi->decimation_count = cpi->decimation_factor;
+ }
+
+ // Decide how big to make the frame
+ if (!pick_frame_size(cpi))
+ {
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+ return;
+ }
+
+ // Reduce active_worst_allowed_q for CBR if our buffer is getting too full.
+ // This has a knock on effect on active best quality as well.
+ // For CBR if the buffer reaches its maximum level then we can no longer
+ // save up bits for later frames so we might as well use them up
+ // on the current frame.
+ if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) && cpi->buffered_mode)
+ {
+ int Adjustment = cpi->active_worst_quality / 4; // Max adjustment is 1/4
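+ // The reduction applied below scales roughly linearly with how far the buffer level sits above
+ // its optimal point, reaching the 1/4 maximum as the buffer approaches its maximum size.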
+
+ if (Adjustment)
+ {
+ int buff_lvl_step;
+ int tmp_lvl = cpi->buffer_level;
+
+ if (cpi->buffer_level < cpi->oxcf.maximum_buffer_size)
+ {
+ buff_lvl_step = (cpi->oxcf.maximum_buffer_size - cpi->oxcf.optimal_buffer_level) / Adjustment;
+
+ if (buff_lvl_step)
+ {
+ Adjustment = (cpi->buffer_level - cpi->oxcf.optimal_buffer_level) / buff_lvl_step;
+ cpi->active_worst_quality -= Adjustment;
+ }
+ }
+ else
+ {
+ cpi->active_worst_quality -= Adjustment;
+ }
+ }
+ }
+
+ // Set an active best quality and if necessary active worst quality
+ if (cpi->pass == 2 || (cm->current_video_frame > 150))
+ {
+ //if ( (cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame )
+ int Q;
+ int i;
+ int bpm_target;
+
+ Q = cpi->active_worst_quality;
+
+ if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
+ {
+ vp8_clear_system_state();
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ // Where a gf overlays an existing arf then allow active max Q to drift to highest allowed value.
+ //if ( cpi->common.refresh_golden_frame && cpi->source_alt_ref_active )
+ //cpi->active_worst_quality = cpi->worst_quality;
+
+ if (cpi->avg_frame_qindex < cpi->active_worst_quality)
+ Q = cpi->avg_frame_qindex;
+
+ if (cpi->section_is_low_motion)
+ bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * ((Q * 3 / 2) + 128)) / 64;
+ else if (cpi->section_is_fast_motion)
+ bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 128)) / 64;
+ else
+ bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * ((Q * 5 / 4) + 128)) / 64;
+ }
+ // KEY FRAMES
+ else
+ {
+ if (cpi->section_is_low_motion)
+ bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 240)) / 64; // Approx 2.5 to 4.5 where Q has the range 0-127
+ else
+ bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 160)) / 64;
+ }
+
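+ // Walk down from Q looking for the first (largest) index whose bits-per-MB table entry
+ // meets the boosted target; that index becomes the active best quality for this frame.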
+ for (i = Q; i > 0; i--)
+ {
+ if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i])
+ break;
+ }
+
+ cpi->active_best_quality = i;
+
+ // this entire section could be replaced by a look up table
+#if 0
+ {
+ int Q, best_q[128];
+
+ for (Q = 0; Q < 128; Q++)
+ {
+ bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 160)) / 64; // Approx 2.5 to 4.5 where Q has the range 0-127
+
+ for (i = Q; i > 0; i--)
+ {
+ if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i])
+ break;
+ }
+
+ best_q[Q] = i;
+ }
+
+ Q += 0;
+ }
+#endif
+
+ }
+ else
+ {
+ vp8_clear_system_state();
+
+ //bpm_target = (vp8_bits_per_mb[cm->frame_type][Q]*(Q+128))/64; // Approx 2 to 4 where Q has the range 0-127
+ bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 192)) / 128; // Approx * 1.5 to 2.5 where Q has range 0-127
+
+ for (i = Q; i > 0; i--)
+ {
+ if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i])
+ break;
+ }
+
+ cpi->active_best_quality = i;
+ }
+
+ // In CBR mode, if the buffer is this full then it is reasonable to allow higher quality on the frames
+ // to prevent bits just going to waste.
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ // Note that the use of >= here eliminates the risk of a divide by 0 error in the else if clause
+ if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)
+ cpi->active_best_quality = cpi->best_quality;
+
+ else if (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)
+ {
+ int Fraction = ((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) * 128) / (cpi->oxcf.maximum_buffer_size - cpi->oxcf.optimal_buffer_level);
+ int min_qadjustment = ((cpi->active_best_quality - cpi->best_quality) * Fraction) / 128;
+
+ cpi->active_best_quality -= min_qadjustment;
+ }
+
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ if (cpi->active_worst_quality > cpi->worst_quality)
+ cpi->active_worst_quality = cpi->worst_quality;
+
+ if (cpi->active_best_quality < cpi->best_quality)
+ cpi->active_best_quality = cpi->best_quality;
+ else if (cpi->active_best_quality > cpi->active_worst_quality)
+ cpi->active_best_quality = cpi->active_worst_quality;
+
+ // Determine initial Q to try
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+ last_zbin_oq = cpi->zbin_over_quant;
+
+ // Set highest allowed value for Zbin over quant
+ if (cm->frame_type == KEY_FRAME)
+ zbin_oq_high = 0; //ZBIN_OQ_MAX/16
+ else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))
+ zbin_oq_high = 16;
+ else
+ zbin_oq_high = ZBIN_OQ_MAX;
+
+ // Setup background Q adjustment for error resilient mode
+ if (cpi->cyclic_refresh_mode_enabled)
+ cyclic_background_refresh(cpi, Q, 0);
+
+ vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
+
+ // Limit Q range for the adaptive loop (Values not clipped to range 20-60 as in VP8).
+ bottom_index = cpi->active_best_quality;
+ top_index = cpi->active_worst_quality;
+
+ vp8_save_coding_context(cpi);
+
+ loop_count = 0;
+
+ q_low = cpi->best_quality;
+ q_high = cpi->worst_quality;
+
+
+ scale_and_extend_source(cpi->un_scaled_source, cpi);
+#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC
+
+ if (cpi->oxcf.noise_sensitivity > 0)
+ {
+ unsigned char *src;
+ int l = 0;
+
+ switch (cpi->oxcf.noise_sensitivity)
+ {
+ case 1:
+ l = 20;
+ break;
+ case 2:
+ l = 40;
+ break;
+ case 3:
+ l = 60;
+ break;
+ case 4:
+ l = 80;
+ break;
+ case 5:
+ l = 100;
+ break;
+ case 6:
+ l = 150;
+ break;
+ }
+
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ vp8_de_noise(cpi->Source, cpi->Source, l , 1, 0, RTCD(postproc));
+ cpi->ppi.frame = 0;
+ }
+ else
+ {
+ vp8_de_noise(cpi->Source, cpi->Source, l , 1, 0, RTCD(postproc));
+
+ src = cpi->Source->y_buffer;
+
+ if (cpi->Source->y_stride < 0)
+ {
+ src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
+ }
+
+ //temp_filter(&cpi->ppi,src,src,
+ // cm->last_frame.y_width * cm->last_frame.y_height,
+ // cpi->oxcf.noise_sensitivity);
+ }
+ }
+
+#endif
+
+#ifdef OUTPUT_YUV_SRC
+ vp8_write_yuv_frame(cpi->Source);
+#endif
+
+ do
+ {
+ vp8_clear_system_state(); //__asm emms;
+
+ /*
+ if(cpi->is_src_frame_alt_ref)
+ Q = 127;
+ */
+
+ set_quantizer(cpi, Q);
+ this_q = Q;
+
+ // setup skip prob for costing in mode/mv decision
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ cpi->prob_skip_false = cpi->base_skip_false_prob[Q];
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ if (cpi->last_skip_false_probs[2] != 0)
+ cpi->prob_skip_false = cpi->last_skip_false_probs[2];
+
+ /*
+ if(cpi->last_skip_false_probs[2]!=0 && abs(Q- cpi->last_skip_probs_q[2])<=16 )
+ cpi->prob_skip_false = cpi->last_skip_false_probs[2];
+ else if (cpi->last_skip_false_probs[2]!=0)
+ cpi->prob_skip_false = (cpi->last_skip_false_probs[2] + cpi->prob_skip_false ) / 2;
+ */
+ }
+ else if (cpi->common.refresh_golden_frame)
+ {
+ if (cpi->last_skip_false_probs[1] != 0)
+ cpi->prob_skip_false = cpi->last_skip_false_probs[1];
+
+ /*
+ if(cpi->last_skip_false_probs[1]!=0 && abs(Q- cpi->last_skip_probs_q[1])<=16 )
+ cpi->prob_skip_false = cpi->last_skip_false_probs[1];
+ else if (cpi->last_skip_false_probs[1]!=0)
+ cpi->prob_skip_false = (cpi->last_skip_false_probs[1] + cpi->prob_skip_false ) / 2;
+ */
+ }
+ else
+ {
+ if (cpi->last_skip_false_probs[0] != 0)
+ cpi->prob_skip_false = cpi->last_skip_false_probs[0];
+
+ /*
+ if(cpi->last_skip_false_probs[0]!=0 && abs(Q- cpi->last_skip_probs_q[0])<=16 )
+ cpi->prob_skip_false = cpi->last_skip_false_probs[0];
+ else if(cpi->last_skip_false_probs[0]!=0)
+ cpi->prob_skip_false = (cpi->last_skip_false_probs[0] + cpi->prob_skip_false ) / 2;
+ */
+ }
+
+ // As this is for a cost estimate, make sure it does not go to an extreme either way
+ if (cpi->prob_skip_false < 5)
+ cpi->prob_skip_false = 5;
+
+ if (cpi->prob_skip_false > 250)
+ cpi->prob_skip_false = 250;
+
+ if (cpi->is_src_frame_alt_ref)
+ cpi->prob_skip_false = 1;
+
+
+ }
+
+#if 0
+
+ if (cpi->pass != 1)
+ {
+ FILE *f = fopen("skip.stt", "a");
+ fprintf(f, "%d, %d, %4d ", cpi->common.refresh_golden_frame, cpi->common.refresh_alt_ref_frame, cpi->prob_skip_false);
+ fclose(f);
+ }
+
+#endif
+
+ }
+
+ if (cm->frame_type == KEY_FRAME)
+ vp8_setup_key_frame(cpi);
+
+ // transform / motion compensation build reconstruction frame
+
+ vp8_encode_frame(cpi);
+ cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
+ cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
+
+ vp8_clear_system_state(); //__asm emms;
+
+ // Test to see if the stats generated for this frame indicate that we should have coded a key frame
+ // (assuming that we didn't)!
+ if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
+ {
+ if (decide_key_frame(cpi))
+ {
+ vp8_calc_auto_iframe_target_size(cpi);
+
+ // Reset all our sizing numbers and recode
+ cm->frame_type = KEY_FRAME;
+
+ // Clear the Alt reference frame active flag when we have a key frame
+ cpi->source_alt_ref_active = FALSE;
+
+ // If segmentation is enabled force a map update for key frames
+ if (cpi->mb.e_mbd.segmentation_enabled)
+ {
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+ }
+
+ // If mode or reference frame based loop filter deltas are enabled then force an update for key frames.
+ if (cpi->mb.e_mbd.mode_ref_lf_delta_enabled)
+ {
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+ }
+
+ vp8_restore_coding_context(cpi);
+
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+ q_low = cpi->best_quality;
+ q_high = cpi->worst_quality;
+
+ vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
+
+ // Limit Q range for the adaptive loop (Values not clipped to range 20-60 as in VP8).
+ bottom_index = cpi->active_best_quality;
+ top_index = cpi->active_worst_quality;
+
+
+ loop_count++;
+ Loop = TRUE;
+
+ resize_key_frame(cpi);
+ continue;
+ }
+ }
+
+ vp8_clear_system_state();
+
+ if (frame_over_shoot_limit == 0)
+ frame_over_shoot_limit = 1;
+
+ // Are we overshooting and up against the limit of active max Q?
+ if (((cpi->pass != 2) || (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) &&
+ (Q == cpi->active_worst_quality) &&
+ (cpi->active_worst_quality < cpi->worst_quality) &&
+ (cpi->projected_frame_size > frame_over_shoot_limit))
+ {
+ int over_size_percent = ((cpi->projected_frame_size - frame_over_shoot_limit) * 100) / frame_over_shoot_limit;
+
+ // If so is there any scope for relaxing it
+ while ((cpi->active_worst_quality < cpi->worst_quality) && (over_size_percent > 0))
+ {
+ cpi->active_worst_quality++;
+ top_index = cpi->active_worst_quality;
+ over_size_percent = (int)(over_size_percent * 0.96); // Assume 1 qstep = about 4% on frame size.
+ }
+
+ // If we have updated the active max Q do not call vp8_update_rate_correction_factors() this loop.
+ active_worst_qchanged = TRUE;
+ }
+ else
+ active_worst_qchanged = FALSE;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+ // Is the projected frame size out of range and are we allowed to attempt to recode.
+ if (((cpi->sf.recode_loop == 1) ||
+ ((cpi->sf.recode_loop == 2) && (cm->refresh_golden_frame || (cm->frame_type == KEY_FRAME)))) &&
+ (((cpi->projected_frame_size > frame_over_shoot_limit) && (Q < top_index)) ||
+ //((cpi->projected_frame_size > frame_over_shoot_limit ) && (Q == top_index) && (cpi->zbin_over_quant < ZBIN_OQ_MAX)) ||
+ ((cpi->projected_frame_size < frame_under_shoot_limit) && (Q > bottom_index)))
+ )
+ {
+ int last_q = Q;
+ int Retries = 0;
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+ if (cpi->projected_frame_size > frame_over_shoot_limit)
+ {
+ //if ( cpi->zbin_over_quant == 0 )
+ q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise q_low to at least the current value
+
+ if (cpi->zbin_over_quant > 0) // If we are using over quant do the same for zbin_oq_low
+ zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+
+ //if ( undershoot_seen || (Q == MAXQ) )
+ if (undershoot_seen)
+ {
+ // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 1);
+
+ Q = (q_high + q_low + 1) / 2;
+
+ // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+ if (Q < MAXQ)
+ cpi->zbin_over_quant = 0;
+ else
+ {
+ zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+ cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+ }
+ }
+ else
+ {
+ // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 0);
+
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+ while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10))
+ {
+ vp8_update_rate_correction_factors(cpi, 0);
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+ Retries ++;
+ }
+ }
+
+ overshoot_seen = TRUE;
+ }
+ else
+ {
+ if (cpi->zbin_over_quant == 0)
+ q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant
+ else // else lower zbin_oq_high
+ zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
+
+ if (overshoot_seen)
+ {
+ // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 1);
+
+ Q = (q_high + q_low) / 2;
+
+ // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+ if (Q < MAXQ)
+ cpi->zbin_over_quant = 0;
+ else
+ cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+ }
+ else
+ {
+ // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 0);
+
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+ while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10))
+ {
+ vp8_update_rate_correction_factors(cpi, 0);
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+ Retries ++;
+ }
+ }
+
+ undershoot_seen = TRUE;
+ }
+
+ // Clamp Q to upper and lower limits:
+ if (Q > q_high)
+ Q = q_high;
+ else if (Q < q_low)
+ Q = q_low;
+
+ // Clamp cpi->zbin_over_quant
+ cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? zbin_oq_high : cpi->zbin_over_quant;
+
+ //Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
+ Loop = ((Q != last_q)) ? TRUE : FALSE;
+ last_zbin_oq = cpi->zbin_over_quant;
+ }
+ else
+#endif
+ Loop = FALSE;
+
+ if (cpi->is_src_frame_alt_ref)
+ Loop = FALSE;
+
+ if (Loop == TRUE)
+ {
+ vp8_restore_coding_context(cpi);
+ loop_count++;
+#if CONFIG_PSNR
+ cpi->tot_recode_hits++;
+#endif
+ }
+ }
+ while (Loop == TRUE);
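The do/while above searches for a quantizer whose coded size lands between frame_under_shoot_limit and frame_over_shoot_limit, narrowing the [q_low, q_high] bracket as it goes. A standalone sketch of that idea (not part of the patch): estimate_frame_size() is a made-up stand-in for re-encoding the frame, and the rate-correction-factor path the encoder uses before both an overshoot and an undershoot have been seen is reduced here to jumping to the bracket edge.

#include <stdio.h>

/* Hypothetical stand-in for re-encoding the frame at quantizer q and
 * returning the resulting size in bits; not part of libvpx. */
static int estimate_frame_size(int q)
{
    return 400000 / (q + 8);   /* toy model: size shrinks as q grows */
}

int main(void)
{
    int q_low = 4, q_high = 63;            /* allowed quantizer range      */
    int target = 12000;                    /* target frame size in bits    */
    int over_limit  = target + target / 8; /* stand-ins for the shoot limits */
    int under_limit = target - target / 8;
    int q = (q_low + q_high) / 2;
    int overshoot_seen = 0, undershoot_seen = 0;

    for (int iter = 0; iter < 10; iter++)
    {
        int size = estimate_frame_size(q);

        if (size > over_limit && q < q_high)
        {
            /* Frame too big: raise the floor, then bisect once both sides
             * of the target have been seen. */
            q_low = q + 1;
            overshoot_seen = 1;
            q = undershoot_seen ? (q_high + q_low + 1) / 2 : q_high;
        }
        else if (size < under_limit && q > q_low)
        {
            /* Frame too small: lower the ceiling. */
            q_high = q - 1;
            undershoot_seen = 1;
            q = overshoot_seen ? (q_high + q_low) / 2 : q_low;
        }
        else
            break;   /* size within bounds (or bracket exhausted): keep q */
    }

    printf("final q = %d, size = %d bits\n", q, estimate_frame_size(q));
    return 0;
}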
+
+#if 0
+ // Experimental code for lagged and one pass
+ // Update stats used for one pass GF selection
+ {
+ /*
+ int frames_so_far;
+ double frame_intra_error;
+ double frame_coded_error;
+ double frame_pcnt_inter;
+ double frame_pcnt_motion;
+ double frame_mvr;
+ double frame_mvr_abs;
+ double frame_mvc;
+ double frame_mvc_abs;
+ */
+
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_coded_error = (double)cpi->prediction_error;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_intra_error = (double)cpi->intra_error;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_pcnt_inter = (double)(100 - cpi->this_frame_percent_intra) / 100.0;
+ }
+#endif
+
+ // Update the GF usage maps.
+ // This is done after completing the compression of a frame when all modes etc. are finalized but before the loop filter
+ vp8_update_gf_useage_maps(cm, &cpi->mb.e_mbd);
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->refresh_last_frame = 1;
+
+ if (0)
+ {
+ FILE *f = fopen("gfactive.stt", "a");
+ fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->common.gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
+ fclose(f);
+ }
+
+ // For inter frames the current default behaviour is that when cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer
+ // This is purely an encoder decision at present.
+ if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame)
+ cm->copy_buffer_to_arf = 2;
+ else
+ cm->copy_buffer_to_arf = 0;
+
+ if (cm->refresh_last_frame)
+ {
+ vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);
+ cm->frame_to_show = &cm->last_frame;
+ }
+ else
+ cm->frame_to_show = &cm->new_frame;
+
+
+
+ //#pragma omp parallel sections
+ {
+
+ //#pragma omp section
+ {
+
+ struct vpx_usec_timer timer;
+
+ vpx_usec_timer_start(&timer);
+
+ if (cpi->sf.auto_filter == 0)
+ vp8cx_pick_filter_level_fast(cpi->Source, cpi);
+ else
+ vp8cx_pick_filter_level(cpi->Source, cpi);
+
+ vpx_usec_timer_mark(&timer);
+
+ cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+
+ if (cm->no_lpf)
+ cm->filter_level = 0;
+
+ if (cm->filter_level > 0)
+ {
+ vp8cx_set_alt_lf_level(cpi, cm->filter_level);
+ vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
+ cm->last_frame_type = cm->frame_type;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+
+ vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+
+ if (cpi->oxcf.error_resilient_mode == 1)
+ {
+ cm->refresh_entropy_probs = 0;
+ }
+
+ }
+//#pragma omp section
+ {
+ // build the bitstream
+ vp8_pack_bitstream(cpi, dest, size);
+ }
+ }
+
+
+ // At this point the new frame has been encoded.
+ // If any buffer copy / swapping is signalled it should be done here.
+ if (cm->frame_type == KEY_FRAME)
+ {
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
+ }
+ else // For non key frames
+ {
+ // Code to copy between reference buffers
+ if (cm->copy_buffer_to_arf)
+ {
+ if (cm->copy_buffer_to_arf == 1)
+ {
+ if (cm->refresh_last_frame)
+ // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
+ vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->alt_ref_frame);
+ else
+ vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->alt_ref_frame);
+ }
+ else if (cm->copy_buffer_to_arf == 2)
+ vp8_yv12_copy_frame_ptr(&cm->golden_frame, &cm->alt_ref_frame);
+ }
+
+ if (cm->copy_buffer_to_gf)
+ {
+ if (cm->copy_buffer_to_gf == 1)
+ {
+ if (cm->refresh_last_frame)
+ // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
+ vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->golden_frame);
+ else
+ vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
+ }
+ else if (cm->copy_buffer_to_gf == 2)
+ vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, &cm->golden_frame);
+ }
+ }
+
+ // Update rate control heuristics
+ cpi->total_byte_count += (*size);
+ cpi->projected_frame_size = (*size) << 3;
+
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 2);
+
+ cpi->last_q[cm->frame_type] = cm->base_qindex;
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ vp8_adjust_key_frame_context(cpi);
+ }
+
+ // Keep a record of ambient average Q.
+ if (cm->frame_type == KEY_FRAME)
+ cpi->avg_frame_qindex = cm->base_qindex;
+ else
+ cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
+
+ // Keep a record from which we can calculate the average Q excluding GF updates and key frames
+ if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame)
+ {
+ cpi->ni_frames++;
+
+ // Calculate the average Q for normal inter frames (not key or GFU frames)
+ // This is used as a basis for setting active worst quality.
+ if (cpi->ni_frames > 150)
+ {
+ cpi->ni_tot_qi += Q;
+ cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+ }
+ // Early in the clip ... average the current frame Q value with the default
+ // entered by the user as a dampening measure
+ else
+ {
+ cpi->ni_tot_qi += Q;
+ cpi->ni_av_qi = ((cpi->ni_tot_qi / cpi->ni_frames) + cpi->worst_quality + 1) / 2;
+ }
+
+ // If the average Q is higher than what was used in the last frame
+ // (after going through the recode loop to keep the frame size within range)
+ // then use the last frame value - 1.
+ // The -1 is designed to stop Q, and hence the data rate, from progressively
+ // falling away during difficult sections, while at the same time reducing the number of
+ // iterations around the recode loop.
+ if (Q > cpi->ni_av_qi)
+ cpi->ni_av_qi = Q - 1;
+
+ }
+
+#if 0
+
+ // If the frame was massively oversize and we are below optimal buffer level drop next frame
+ if ((cpi->drop_frames_allowed) &&
+ (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ (cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) &&
+ (cpi->projected_frame_size > (4 * cpi->this_frame_target)))
+ {
+ cpi->drop_frame = TRUE;
+ }
+
+#endif
+
+ // Set the count for maximum consecutive dropped frames based upon the ratio of
+ // this frame size to the target average per frame bandwidth.
+ // (cpi->av_per_frame_bandwidth > 0) is just a sanity check to prevent a divide by zero.
+ if (cpi->drop_frames_allowed && (cpi->av_per_frame_bandwidth > 0))
+ {
+ cpi->max_drop_count = cpi->projected_frame_size / cpi->av_per_frame_bandwidth;
+
+ if (cpi->max_drop_count > cpi->max_consec_dropped_frames)
+ cpi->max_drop_count = cpi->max_consec_dropped_frames;
+ }
+
+ // Update the buffer level variable.
+ if (cpi->common.refresh_alt_ref_frame)
+ cpi->bits_off_target -= cpi->projected_frame_size;
+ else
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+
+ // Rolling monitors of whether we are over or underspending, used to help regulate min and max Q in two pass.
+ cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
+ cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
+ cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
+ cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
+
+ // Actual bits spent
+ cpi->total_actual_bits += cpi->projected_frame_size;
+
+ // Debug stats
+ cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+
+ cpi->buffer_level = cpi->bits_off_target;
+
+ // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
+ if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+ if (cpi->kf_group_bits < 0)
+ cpi->kf_group_bits = 0 ;
+ }
+ else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+ {
+ cpi->gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+ if (cpi->gf_group_bits < 0)
+ cpi->gf_group_bits = 0 ;
+ }
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->last_skip_false_probs[2] = cpi->prob_skip_false;
+ cpi->last_skip_probs_q[2] = cm->base_qindex;
+ }
+ else if (cpi->common.refresh_golden_frame)
+ {
+ cpi->last_skip_false_probs[1] = cpi->prob_skip_false;
+ cpi->last_skip_probs_q[1] = cm->base_qindex;
+ }
+ else
+ {
+ cpi->last_skip_false_probs[0] = cpi->prob_skip_false;
+ cpi->last_skip_probs_q[0] = cm->base_qindex;
+
+ //update the baseline
+ cpi->base_skip_false_prob[cm->base_qindex] = cpi->prob_skip_false;
+
+ }
+ }
+
+#if CONFIG_PSNR
+
+ if (0)
+ {
+ FILE *f = fopen("tmp.stt", "a");
+
+ vp8_clear_system_state(); //__asm emms;
+
+ if (cpi->total_coded_error_left != 0.0)
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f %10.3f %8ld\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), (int)cpi->total_target_vs_actual, (cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality, cpi->avg_frame_qindex, cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->est_max_qcorrection_factor, (int)cpi->bits_left, cpi->total_coded_error_left, (double)cpi->bits_left / cpi->total_coded_error_left, cpi->tot_recode_hits);
+ else
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f %8ld\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), (int)cpi->total_target_vs_actual, (cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality, cpi->avg_frame_qindex, cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->est_max_qcorrection_factor, (int)cpi->bits_left, cpi->total_coded_error_left, cpi->tot_recode_hits);
+
+ fclose(f);
+
+ {
+ FILE *fmodes = fopen("Modes.stt", "a");
+ int i;
+
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, cm->frame_type, cm->refresh_golden_frame, cm->refresh_alt_ref_frame);
+
+ for (i = 0; i < MAX_MODES; i++)
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+ fprintf(fmodes, "\n");
+
+ fclose(fmodes);
+ }
+ }
+
+#endif
+
+ // If this was a kf or Gf note the Q
+ if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+ cm->last_kf_gf_q = cm->base_qindex;
+
+ if (cm->refresh_golden_frame == 1)
+ cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
+ else
+ cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
+
+ if (cm->refresh_alt_ref_frame == 1)
+ cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
+ else
+ cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
+
+
+ if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed
+ cpi->gold_is_last = 1;
+ else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+ cpi->gold_is_last = 0;
+
+ if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed
+ cpi->alt_is_last = 1;
+ else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other
+ cpi->alt_is_last = 0;
+
+ if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed
+ cpi->gold_is_alt = 1;
+ else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+ cpi->gold_is_alt = 0;
+
+ cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+
+ if (cpi->gold_is_last)
+ cpi->ref_frame_flags &= ~VP8_GOLD_FLAG;
+
+ if (cpi->alt_is_last)
+ cpi->ref_frame_flags &= ~VP8_ALT_FLAG;
+
+ if (cpi->gold_is_alt)
+ cpi->ref_frame_flags &= ~VP8_ALT_FLAG;
+
+
+ if (cpi->oxcf.error_resilient_mode)
+ {
+ // Is this an alternate reference update
+ if (cpi->common.refresh_alt_ref_frame)
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
+
+ if (cpi->common.refresh_golden_frame)
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
+ }
+ else
+ {
+ if (cpi->oxcf.play_alternate && cpi->common.refresh_alt_ref_frame)
+ // Update the alternate reference frame and stats as appropriate.
+ update_alt_ref_frame_and_stats(cpi);
+ else
+ // Update the golden frame and stats as appropriate.
+ update_golden_frame_and_stats(cpi);
+ }
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ // Tell the caller that the frame was coded as a key frame
+ *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
+
+ // As this frame is a key frame the next defaults to an inter frame.
+ cm->frame_type = INTER_FRAME;
+
+ cpi->last_frame_percent_intra = 100;
+ }
+ else
+ {
+ *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
+
+ cpi->last_frame_percent_intra = cpi->this_frame_percent_intra;
+ }
+
+ // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+ cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+
+
+ // Don't increment frame counters if this was an altref buffer update, not a real frame
+ if (cm->show_frame)
+ {
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+ }
+
+ // reset to normal state now that we are done.
+
+
+
+ if (0)
+ {
+ char filename[512];
+ FILE *recon_file;
+ sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+ recon_file = fopen(filename, "wb");
+ fwrite(cm->last_frame.buffer_alloc, cm->last_frame.frame_size, 1, recon_file);
+ fclose(recon_file);
+ }
+
+ // DEBUG
+ //vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show);
+
+
+}
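The rate-control bookkeeping near the end of encode_frame_to_data_rate() amounts to a leaky bit bucket plus exponentially weighted rolling averages of target and actual sizes. A minimal standalone sketch (not encoder code) with made-up per-frame numbers, using the same 3/4-old, 1/4-new weighting as the short rolling monitors above:

#include <stdio.h>

int main(void)
{
    int av_per_frame_bandwidth = 10000;   /* bits budgeted per frame */
    int bits_off_target = 0;              /* signed buffer level     */
    int rolling_target = 10000, rolling_actual = 10000;

    /* Hypothetical coded frame sizes in bits. */
    int frame_sizes[] = { 9000, 14000, 8000, 11000, 10000 };

    for (int i = 0; i < 5; i++)
    {
        int target = 10000;
        int actual = frame_sizes[i];

        /* Underspend raises the buffer level, overspend drains it. */
        bits_off_target += av_per_frame_bandwidth - actual;

        /* Short rolling monitors: 3/4 old value, 1/4 new value. */
        rolling_target = (rolling_target * 3 + target + 2) / 4;
        rolling_actual = (rolling_actual * 3 + actual + 2) / 4;

        printf("frame %d: buffer %6d  rolling target %5d  rolling actual %5d\n",
               i, bits_off_target, rolling_target, rolling_actual);
    }
    return 0;
}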
+
+int vp8_is_gf_update_needed(VP8_PTR ptr)
+{
+ VP8_COMP *cpi = (VP8_COMP *) ptr;
+ int ret_val;
+
+ ret_val = cpi->gf_update_recommended;
+ cpi->gf_update_recommended = 0;
+
+ return ret_val;
+}
+
+void vp8_check_gf_quality(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+ int gf_active_pct = (100 * cm->gf_active_count) / (cm->mb_rows * cm->mb_cols);
+ int gf_ref_usage_pct = (cpi->count_mb_ref_frame_usage[GOLDEN_FRAME] * 100) / (cm->mb_rows * cm->mb_cols);
+ int last_ref_zz_useage = (cpi->inter_zz_count * 100) / (cm->mb_rows * cm->mb_cols);
+
+ // Gf refresh is not currently being signalled
+ if (cpi->gf_update_recommended == 0)
+ {
+ if (cpi->common.frames_since_golden > 7)
+ {
+ // Low use of gf
+ if ((gf_active_pct < 10) || ((gf_active_pct + gf_ref_usage_pct) < 15))
+ {
+ // ...but last frame zero-zero usage is reasonable so a new gf might be appropriate
+ if (last_ref_zz_useage >= 25)
+ {
+ cpi->gf_bad_count ++;
+
+ if (cpi->gf_bad_count >= 8) // Check that the condition is stable
+ {
+ cpi->gf_update_recommended = 1;
+ cpi->gf_bad_count = 0;
+ }
+ }
+ else
+ cpi->gf_bad_count = 0; // Restart count as the background is not stable enough
+ }
+ else
+ cpi->gf_bad_count = 0; // Gf usage has picked up so reset count
+ }
+ }
+ // If the signal is set but has not been read, cancel it if the background is no longer stable enough.
+ else if (last_ref_zz_useage < 15)
+ {
+ cpi->gf_update_recommended = 0;
+ cpi->gf_bad_count = 0;
+ }
+
+#if 0
+
+ if (0)
+ {
+ FILE *f = fopen("gfneeded.stt", "a");
+ fprintf(f, "%10d %10d %10d %10d %10ld \n",
+ cm->current_video_frame,
+ cpi->common.frames_since_golden,
+ gf_active_pct, gf_ref_usage_pct,
+ cpi->gf_update_recommended);
+ fclose(f);
+ }
+
+#endif
+}
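vp8_check_gf_quality() can be read as a small hysteresis filter: golden-frame usage must stay low while zero-motion usage stays high for eight consecutive frames before a refresh is recommended. A standalone restatement (not part of the patch) with the counters passed in rather than read from VP8_COMP; the driver values in main() are hypothetical:

#include <stdio.h>

static int check_gf_quality(int frames_since_golden, int gf_active_pct,
                            int gf_ref_usage_pct, int last_ref_zz_pct,
                            int *gf_bad_count)
{
    if (frames_since_golden > 7 &&
        (gf_active_pct < 10 || gf_active_pct + gf_ref_usage_pct < 15))
    {
        if (last_ref_zz_pct >= 25)
        {
            /* Background looks stable: only recommend a refresh once the
             * condition has held for 8 consecutive frames. */
            if (++*gf_bad_count >= 8)
            {
                *gf_bad_count = 0;
                return 1;
            }
        }
        else
            *gf_bad_count = 0;   /* background not stable enough: restart */
    }
    else
        *gf_bad_count = 0;       /* gf usage has picked up: restart       */

    return 0;
}

int main(void)
{
    int bad = 0;

    /* Feed ten frames of low GF usage over a stable background. */
    for (int f = 0; f < 10; f++)
        if (check_gf_quality(8 + f, 5, 5, 40, &bad))
            printf("GF update recommended at frame %d\n", f);

    return 0;
}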
+
+#if !(CONFIG_REALTIME_ONLY)
+static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
+{
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ vp8_second_pass(cpi);
+
+ encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+ cpi->bits_left -= 8 * *size;
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ cpi->bits_left += (long long)(two_pass_min_rate / cpi->oxcf.frame_rate);
+}
+#endif
+
+//For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
+#if HAVE_ARMV7
+extern void vp8_push_neon(INT64 *store);
+extern void vp8_pop_neon(INT64 *store);
+static INT64 store_reg[8];
+#endif
+int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time)
+{
+ VP8_COMP *cpi = (VP8_COMP *) ptr;
+ VP8_COMMON *cm = &cpi->common;
+ struct vpx_usec_timer timer;
+
+ if (!cpi)
+ return -1;
+
+#if HAVE_ARMV7
+ vp8_push_neon(store_reg);
+#endif
+
+ vpx_usec_timer_start(&timer);
+
+ // no more room for frames;
+ if (cpi->source_buffer_count != 0 && cpi->source_buffer_count >= cpi->oxcf.lag_in_frames)
+ {
+#if HAVE_ARMV7
+ vp8_pop_neon(store_reg);
+#endif
+ return -1;
+ }
+
+ //printf("in-cpi->source_buffer_count: %d\n", cpi->source_buffer_count);
+
+ cm->clr_type = sd->clrtype;
+
+ // make a copy of the frame for use later...
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->oxcf.allow_lag)
+ {
+ int which_buffer = cpi->source_encode_index - 1;
+ SOURCE_SAMPLE *s;
+
+ if (which_buffer == -1)
+ which_buffer = cpi->oxcf.lag_in_frames - 1;
+
+ if (cpi->source_buffer_count < cpi->oxcf.lag_in_frames - 1)
+ which_buffer = cpi->source_buffer_count;
+
+ s = &cpi->src_buffer[which_buffer];
+
+ s->source_time_stamp = time_stamp;
+ s->source_end_time_stamp = end_time;
+ s->source_frame_flags = frame_flags;
+ vp8_yv12_copy_frame_ptr(sd, &s->source_buffer);
+
+ cpi->source_buffer_count ++;
+ }
+ else
+#endif
+ {
+ SOURCE_SAMPLE *s;
+ s = &cpi->src_buffer[0];
+ s->source_end_time_stamp = end_time;
+ s->source_time_stamp = time_stamp;
+ s->source_frame_flags = frame_flags;
+#if HAVE_ARMV7
+ vp8_yv12_copy_src_frame_func_neon(sd, &s->source_buffer);
+#else
+ vp8_yv12_copy_frame_ptr(sd, &s->source_buffer);
+#endif
+ cpi->source_buffer_count = 1;
+ }
+
+ vpx_usec_timer_mark(&timer);
+ cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+#if HAVE_ARMV7
+ vp8_pop_neon(store_reg);
+#endif
+
+ return 0;
+}
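The lagged-encoding path treats src_buffer[] as a ring of lag_in_frames slots: frames are appended until the lag fills, after which each new frame lands in the slot just behind source_encode_index. A toy simulation of that indexing (not encoder code) which interleaves one store per step and, once the ring is full, one consume per step:

#include <stdio.h>

int main(void)
{
    int lag_in_frames = 5;        /* size of the source ring buffer */
    int source_encode_index = 0;  /* next slot to be encoded        */
    int source_buffer_count = 0;  /* frames currently buffered      */

    for (int frame = 0; frame < 10; frame++)
    {
        if (source_buffer_count >= lag_in_frames)
        {
            /* Ring is full: the encoder would emit a compressed frame here,
             * consuming the slot at source_encode_index. */
            printf("  encode from slot %d\n", source_encode_index);
            source_encode_index = (source_encode_index + 1) % lag_in_frames;
            source_buffer_count--;
        }

        /* Slot selection as in vp8_receive_raw_frame() above. */
        int which_buffer = source_encode_index - 1;

        if (which_buffer == -1)
            which_buffer = lag_in_frames - 1;

        if (source_buffer_count < lag_in_frames - 1)
            which_buffer = source_buffer_count;   /* still filling the lag */

        printf("frame %d stored in slot %d\n", frame, which_buffer);
        source_buffer_count++;
    }
    return 0;
}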
+int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush)
+{
+
+ VP8_COMP *cpi = (VP8_COMP *) ptr;
+ VP8_COMMON *cm = &cpi->common;
+ struct vpx_usec_timer tsctimer;
+ struct vpx_usec_timer ticktimer;
+ struct vpx_usec_timer cmptimer;
+
+ if (!cpi)
+ return -1;
+
+#if HAVE_ARMV7
+ vp8_push_neon(store_reg);
+#endif
+
+ vpx_usec_timer_start(&cmptimer);
+
+
+ // The flush variable tells us that even though we have fewer frames in our buffer
+ // than the configured lag we need to start producing compressed frames,
+ // probably because we are at the end of a file.
+ if ((cpi->source_buffer_count == cpi->oxcf.lag_in_frames && cpi->oxcf.lag_in_frames > 0)
+ || (!cpi->oxcf.allow_lag && cpi->source_buffer_count > 0)
+ || (flush && cpi->source_buffer_count > 0))
+ {
+
+ SOURCE_SAMPLE *s;
+
+ s = &cpi->src_buffer[cpi->source_encode_index];
+ cpi->source_time_stamp = s->source_time_stamp;
+ cpi->source_end_time_stamp = s->source_end_time_stamp;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+ // Should we code an alternate reference frame
+ if (cpi->oxcf.error_resilient_mode == 0 &&
+ cpi->oxcf.play_alternate &&
+ cpi->source_alt_ref_pending &&
+ (cpi->frames_till_gf_update_due < cpi->source_buffer_count) &&
+ cpi->oxcf.lag_in_frames != 0)
+ {
+ cpi->last_alt_ref_sei = (cpi->source_encode_index + cpi->frames_till_gf_update_due) % cpi->oxcf.lag_in_frames;
+
+#if VP8_TEMPORAL_ALT_REF
+
+ if (cpi->oxcf.arnr_max_frames > 0)
+ {
+#if 0
+ // my attempt at a loop that tests the results of strength filter.
+ int start_frame = cpi->last_alt_ref_sei - 3;
+
+ int i, besti = -1, pastin = cpi->oxcf.arnr_strength;
+
+ int besterr;
+
+ if (start_frame < 0)
+ start_frame += cpi->oxcf.lag_in_frames;
+
+ besterr = vp8_calc_low_ss_err(&cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer,
+ &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
+
+ for (i = 0; i < 7; i++)
+ {
+ int thiserr;
+ cpi->oxcf.arnr_strength = i;
+ vp8cx_temp_filter_c(cpi);
+
+ thiserr = vp8_calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer,
+ &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
+
+ if (10 * thiserr < besterr * 8)
+ {
+ besterr = thiserr;
+ besti = i;
+ }
+ }
+
+ if (besti != -1)
+ {
+ cpi->oxcf.arnr_strength = besti;
+ vp8cx_temp_filter_c(cpi);
+ s = &cpi->alt_ref_buffer;
+
+ // FWG not sure if I need to copy this data for the Alt Ref frame
+ s->source_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_time_stamp;
+ s->source_end_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_end_time_stamp;
+ s->source_frame_flags = cpi->src_buffer[cpi->last_alt_ref_sei].source_frame_flags;
+ }
+ else
+ s = &cpi->src_buffer[cpi->last_alt_ref_sei];
+
+#else
+ vp8cx_temp_filter_c(cpi);
+ s = &cpi->alt_ref_buffer;
+
+ // FWG not sure if I need to copy this data for the Alt Ref frame
+ s->source_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_time_stamp;
+ s->source_end_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_end_time_stamp;
+ s->source_frame_flags = cpi->src_buffer[cpi->last_alt_ref_sei].source_frame_flags;
+
+#endif
+ }
+ else
+#endif
+ s = &cpi->src_buffer[cpi->last_alt_ref_sei];
+
+ cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+ cm->refresh_alt_ref_frame = 1;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 0;
+ cm->show_frame = 0;
+ cpi->source_alt_ref_pending = FALSE; // Clear pending alt ref flag.
+ cpi->is_src_frame_alt_ref = 0;
+ }
+ else
+#endif
+ {
+ cm->show_frame = 1;
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->oxcf.allow_lag)
+ {
+ if (cpi->source_encode_index == cpi->last_alt_ref_sei)
+ {
+#if VP8_TEMPORAL_ALT_REF
+
+ if (cpi->oxcf.arnr_max_frames == 0)
+ {
+ cpi->is_src_frame_alt_ref = 1; // copy alt ref
+ }
+ else
+ {
+ cpi->is_src_frame_alt_ref = 0;
+ }
+
+#else
+ cpi->is_src_frame_alt_ref = 1;
+#endif
+ cpi->last_alt_ref_sei = -1;
+ }
+ else
+ cpi->is_src_frame_alt_ref = 0;
+
+ cpi->source_encode_index = (cpi->source_encode_index + 1) % cpi->oxcf.lag_in_frames;
+ }
+
+#endif
+ cpi->source_buffer_count--;
+ }
+
+ cpi->un_scaled_source = &s->source_buffer;
+ cpi->Source = &s->source_buffer;
+ cpi->source_frame_flags = s->source_frame_flags;
+
+ *time_stamp = cpi->source_time_stamp;
+ *time_end = cpi->source_end_time_stamp;
+ }
+ else
+ {
+ *size = 0;
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (flush && cpi->pass == 1 && !cpi->first_pass_done)
+ {
+ vp8_end_first_pass(cpi); /* get last stats packet */
+ cpi->first_pass_done = 1;
+ }
+
+#endif
+
+#if HAVE_ARMV7
+ vp8_pop_neon(store_reg);
+#endif
+ return -1;
+ }
+
+ *frame_flags = cpi->source_frame_flags;
+
+#if CONFIG_PSNR
+
+ if (cpi->source_time_stamp < cpi->first_time_stamp_ever)
+ cpi->first_time_stamp_ever = cpi->source_time_stamp;
+
+#endif
+
+ // adjust frame rates based on timestamps given
+ if (!cm->refresh_alt_ref_frame)
+ {
+ if (cpi->last_time_stamp_seen == 0)
+ {
+ double this_fps = 10000000.000 / (cpi->source_end_time_stamp - cpi->source_time_stamp);
+
+ vp8_new_frame_rate(cpi, this_fps);
+ }
+ else
+ {
+ long long nanosecs = cpi->source_time_stamp - cpi->last_time_stamp_seen;
+ double this_fps = 10000000.000 / nanosecs;
+
+ vp8_new_frame_rate(cpi, (7 * cpi->oxcf.frame_rate + this_fps) / 8);
+
+ }
+
+ cpi->last_time_stamp_seen = cpi->source_time_stamp;
+ }
+
+ if (cpi->compressor_speed == 2)
+ {
+ vp8_check_gf_quality(cpi);
+ }
+
+ if (!cpi)
+ {
+#if HAVE_ARMV7
+ vp8_pop_neon(store_reg);
+#endif
+ return 0;
+ }
+
+ if (cpi->compressor_speed == 2)
+ {
+ vpx_usec_timer_start(&tsctimer);
+ vpx_usec_timer_start(&ticktimer);
+ }
+
+ // start with a 0 size frame
+ *size = 0;
+
+ // Clear down mmx registers
+ vp8_clear_system_state(); //__asm emms;
+
+ cm->frame_type = INTER_FRAME;
+ cm->frame_flags = *frame_flags;
+
+#if 0
+
+ if (cm->refresh_alt_ref_frame)
+ {
+ //cm->refresh_golden_frame = 1;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 0;
+ }
+ else
+ {
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ }
+
+#endif
+
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 1)
+ {
+ Pass1Encode(cpi, size, dest, frame_flags);
+ }
+ else if (cpi->pass == 2)
+ {
+ Pass2Encode(cpi, size, dest, frame_flags);
+ }
+ else
+#endif
+ encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+
+ if (cpi->compressor_speed == 2)
+ {
+ unsigned int duration, duration2;
+ vpx_usec_timer_mark(&tsctimer);
+ vpx_usec_timer_mark(&ticktimer);
+
+ duration = vpx_usec_timer_elapsed(&ticktimer);
+ duration2 = (unsigned int)((double)duration / 2);
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ if (cpi->avg_encode_time == 0)
+ cpi->avg_encode_time = duration;
+ else
+ cpi->avg_encode_time = (7 * cpi->avg_encode_time + duration) >> 3;
+ }
+
+ if (duration2)
+ {
+ //if(*frame_flags!=1)
+ {
+
+ if (cpi->avg_pick_mode_time == 0)
+ cpi->avg_pick_mode_time = duration2;
+ else
+ cpi->avg_pick_mode_time = (7 * cpi->avg_pick_mode_time + duration2) >> 3;
+ }
+ }
+
+ }
+
+ if (cm->refresh_entropy_probs == 0)
+ {
+ vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
+ }
+
+ // If it's a dropped frame, honor the requests on subsequent frames
+ if (*size > 0)
+ {
+
+ // return to normal state
+ cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+
+ cm->refresh_entropy_probs = 1;
+ cm->refresh_alt_ref_frame = 0;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->frame_type = INTER_FRAME;
+
+ }
+
+ cpi->ready_for_new_frame = 1;
+
+ vpx_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+ if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
+ generate_psnr_packet(cpi);
+
+#if CONFIG_PSNR
+
+ if (cpi->pass != 1)
+ {
+ cpi->bytes += *size;
+
+ if (cm->show_frame)
+ {
+
+ cpi->count ++;
+
+ if (cpi->b_calculate_psnr)
+ {
+ double y, u, v;
+ double sq_error;
+ double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error);
+
+ cpi->total_y += y;
+ cpi->total_u += u;
+ cpi->total_v += v;
+ cpi->total_sq_error += sq_error;
+ cpi->total += frame_psnr;
+ {
+ double y2, u2, v2, frame_psnr2, frame_ssim2 = 0;
+ double weight = 0;
+
+ vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
+ vp8_clear_system_state();
+ frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error);
+ frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight);
+
+ cpi->summed_quality += frame_ssim2 * weight;
+ cpi->summed_weights += weight;
+
+ cpi->totalp_y += y2;
+ cpi->totalp_u += u2;
+ cpi->totalp_v += v2;
+ cpi->totalp += frame_psnr2;
+ cpi->total_sq_error2 += sq_error;
+
+ }
+ }
+
+ if (cpi->b_calculate_ssimg)
+ {
+ double y, u, v, frame_all;
+ frame_all = vp8_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+ cpi->total_ssimg_y += y;
+ cpi->total_ssimg_u += u;
+ cpi->total_ssimg_v += v;
+ cpi->total_ssimg_all += frame_all;
+ }
+
+ }
+ }
+
+#if 0
+
+ if (cpi->common.frame_type != 0 && cpi->common.base_qindex == cpi->oxcf.worst_allowed_q)
+ {
+ skiptruecount += cpi->skip_true_count;
+ skipfalsecount += cpi->skip_false_count;
+ }
+
+#endif
+#if 0
+
+ if (cpi->pass != 1)
+ {
+ FILE *f = fopen("skip.stt", "a");
+ fprintf(f, "frame:%4d flags:%4x Q:%4d P:%4d Size:%5d\n", cpi->common.current_video_frame, *frame_flags, cpi->common.base_qindex, cpi->prob_skip_false, *size);
+
+ if (cpi->is_src_frame_alt_ref == 1)
+ fprintf(f, "skipcount: %4d framesize: %d\n", cpi->skip_true_count , *size);
+
+ fclose(f);
+ }
+
+#endif
+#endif
+
+#if HAVE_ARMV7
+ vp8_pop_neon(store_reg);
+#endif
+
+ return 0;
+}
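The timestamp handling above adapts the frame-rate estimate with a 7/8 exponential moving average, and the same weighting is used for avg_encode_time and avg_pick_mode_time. A tiny standalone sketch with made-up samples:

#include <stdio.h>

int main(void)
{
    double frame_rate = 30.0;   /* current estimate */

    /* Hypothetical instantaneous rates measured from source timestamps. */
    double samples[] = { 30.0, 25.0, 25.0, 25.0, 25.0, 25.0 };

    for (int i = 0; i < 6; i++)
    {
        /* Same 7/8-old, 1/8-new smoothing as the timestamp handling above. */
        frame_rate = (7.0 * frame_rate + samples[i]) / 8.0;
        printf("after sample %d: %.3f fps\n", i, frame_rate);
    }
    return 0;
}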
+
+int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
+{
+ VP8_COMP *cpi = (VP8_COMP *) comp;
+
+ if (cpi->common.refresh_alt_ref_frame)
+ return -1;
+ else
+ {
+ int ret;
+#if CONFIG_POSTPROC
+ ret = vp8_post_proc_frame(&cpi->common, dest, deblock_level, noise_level, flags);
+#else
+
+ if (cpi->common.frame_to_show)
+ {
+ *dest = *cpi->common.frame_to_show;
+ dest->y_width = cpi->common.Width;
+ dest->y_height = cpi->common.Height;
+ dest->uv_height = cpi->common.Height / 2;
+ ret = 0;
+ }
+ else
+ {
+ ret = -1;
+ }
+
+#endif //!CONFIG_POSTPROC
+ vp8_clear_system_state();
+ return ret;
+ }
+}
+
+int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4])
+{
+ VP8_COMP *cpi = (VP8_COMP *) comp;
+ signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+
+ if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+ return -1;
+
+ if (!map)
+ {
+ disable_segmentation((VP8_PTR)cpi);
+ return 0;
+ }
+
+ // Set the segmentation Map
+ set_segmentation_map((VP8_PTR)cpi, map);
+
+ // Activate segmentation.
+ enable_segmentation((VP8_PTR)cpi);
+
+ // Set up the quant segment data
+ feature_data[MB_LVL_ALT_Q][0] = delta_q[0];
+ feature_data[MB_LVL_ALT_Q][1] = delta_q[1];
+ feature_data[MB_LVL_ALT_Q][2] = delta_q[2];
+ feature_data[MB_LVL_ALT_Q][3] = delta_q[3];
+
+ // Set up the loop filter segment data
+ feature_data[MB_LVL_ALT_LF][0] = delta_lf[0];
+ feature_data[MB_LVL_ALT_LF][1] = delta_lf[1];
+ feature_data[MB_LVL_ALT_LF][2] = delta_lf[2];
+ feature_data[MB_LVL_ALT_LF][3] = delta_lf[3];
+
+ cpi->segment_encode_breakout[0] = threshold[0];
+ cpi->segment_encode_breakout[1] = threshold[1];
+ cpi->segment_encode_breakout[2] = threshold[2];
+ cpi->segment_encode_breakout[3] = threshold[3];
+
+ // Initialise the feature data structure
+ // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
+ set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+ return 0;
+}
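A hypothetical caller of vp8_set_roimap(), assuming an initialised encoder handle and a 10x8-macroblock frame; the segment layout, deltas and helper name are illustrative only, and onyx.h is assumed to provide the prototype above:

#include "onyx.h"   /* for VP8_PTR and the vp8_set_roimap() prototype */

void apply_roi(VP8_PTR encoder)
{
    unsigned int rows = 8, cols = 10;
    unsigned char map[8 * 10] = { 0 };          /* everything starts in segment 0    */
    int delta_q[4]  = { 0, -10, 0, 0 };         /* segment 1 gets a lower (better) Q */
    int delta_lf[4] = { 0, 0, 0, 0 };
    unsigned int threshold[4] = { 0, 0, 0, 0 }; /* no encode breakout                */
    unsigned int r, c;

    /* Put the centre 4x4 block of macroblocks into segment 1. */
    for (r = 2; r < 6; r++)
        for (c = 3; c < 7; c++)
            map[r * cols + c] = 1;

    if (vp8_set_roimap(encoder, map, rows, cols, delta_q, delta_lf, threshold) != 0)
        return;   /* rows/cols did not match the encoder's mb_rows/mb_cols */
}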
+
+int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols)
+{
+ VP8_COMP *cpi = (VP8_COMP *) comp;
+
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols)
+ {
+ if (map)
+ {
+ vpx_memcpy(cpi->active_map, map, rows * cols);
+ cpi->active_map_enabled = 1;
+ }
+ else
+ cpi->active_map_enabled = 0;
+
+ return 0;
+ }
+ else
+ {
+ //cpi->active_map_enabled = 0;
+ return -1 ;
+ }
+}
+
+int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode)
+{
+ VP8_COMP *cpi = (VP8_COMP *) comp;
+
+ if (horiz_mode >= NORMAL && horiz_mode <= ONETWO)
+ cpi->common.horiz_scale = horiz_mode;
+ else
+ return -1;
+
+ if (vert_mode >= NORMAL && vert_mode <= ONETWO)
+ cpi->common.vert_scale = vert_mode;
+ else
+ return -1;
+
+ return 0;
+}
+
+
+
+int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd)
+{
+ int i, j;
+ int Total = 0;
+
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+ (void)rtcd;
+
+ // Loop through the Y plane raw and reconstruction data summing (square differences)
+ for (i = 0; i < source->y_height; i += 16)
+ {
+ for (j = 0; j < source->y_width; j += 16)
+ {
+ unsigned int sse;
+ Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+ }
+
+ src += 16 * source->y_stride;
+ dst += 16 * dest->y_stride;
+ }
+
+ return Total;
+}
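vp8_calc_ss_err() returns a summed squared error over the luma plane; turning that into a PSNR figure is a one-liner, shown here with made-up numbers (the encoder's own vp8_calc_psnr() may cap or weight the result differently):

#include <math.h>
#include <stdio.h>

/* Convert a summed squared error over num_pixels 8-bit samples into PSNR. */
static double sse_to_psnr(double sse, double num_pixels)
{
    if (sse == 0.0)
        return 99.0;   /* conventional cap for identical frames */

    return 10.0 * log10(255.0 * 255.0 * num_pixels / sse);
}

int main(void)
{
    /* Hypothetical numbers: a 640x480 luma plane with an SSE of 3,000,000. */
    printf("PSNR = %.2f dB\n", sse_to_psnr(3000000.0, 640.0 * 480.0));
    return 0;
}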
+int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd)
+{
+ int i, j;
+ int Total = 0;
+
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+ (void)rtcd;
+
+ // Loop through the Y plane raw and reconstruction data summing (square differences)
+ for (i = 0; i < source->y_height; i += 16)
+ {
+ for (j = 0; j < source->y_width; j += 16)
+ {
+ unsigned int sse;
+ VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+
+ if (sse < 8096)
+ Total += sse;
+ }
+
+ src += 16 * source->y_stride;
+ dst += 16 * dest->y_stride;
+ }
+
+ return Total;
+}
+
+int vp8_get_speed(VP8_PTR c)
+{
+ VP8_COMP *cpi = (VP8_COMP *) c;
+ return cpi->Speed;
+}
+int vp8_get_quantizer(VP8_PTR c)
+{
+ VP8_COMP *cpi = (VP8_COMP *) c;
+ return cpi->common.base_qindex;
+}
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
new file mode 100644
index 000000000..29b120ed4
--- /dev/null
+++ b/vp8/encoder/onyx_int.h
@@ -0,0 +1,670 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_INT_H
+#define __INC_VP8_INT_H
+
+#include <stdio.h>
+#include "vpx_ports/config.h"
+#include "onyx.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "onyxc_int.h"
+#include "preproc.h"
+#include "variance.h"
+#include "dct.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "entropy.h"
+#include "threading.h"
+#include "vpx_ports/mem.h"
+#include "vpx_codec/internal/vpx_codec_internal.h"
+#include "mcomp.h"
+
+#define INTRARDOPT
+//#define SPEEDSTATS 1
+#define MIN_GF_INTERVAL 4
+#define DEFAULT_GF_INTERVAL 7
+
+#define KEY_FRAME_CONTEXT 5
+
+#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY? 1 : 25)
+
+#define AF_THRESH 25
+#define AF_THRESH2 100
+#define ARF_DECAY_THRESH 12
+#define MAX_MODES 20
+
+#define MIN_THRESHMULT 32
+#define MAX_THRESHMULT 512
+
+#define GF_ZEROMV_ZBIN_BOOST 24
+#define ZBIN_OQ_MAX 192
+
+#define VP8_TEMPORAL_ALT_REF 1
+
+typedef struct
+{
+ int kf_indicated;
+ unsigned int frames_since_key;
+ unsigned int frames_since_golden;
+ int filter_level;
+ int frames_till_gf_update_due;
+ int recent_ref_frame_usage[MAX_REF_FRAMES];
+
+ MV_CONTEXT mvc[2];
+ int mvcosts[2][MVvals+1];
+
+#ifdef MODE_STATS
+ // Stats
+ int y_modes[5];
+ int uv_modes[4];
+ int b_modes[10];
+ int inter_y_modes[10];
+ int inter_uv_modes[4];
+ int inter_b_modes[10];
+#endif
+
+ vp8_prob ymode_prob[4], uv_mode_prob[3]; /* interframe intra mode probs */
+ vp8_prob kf_ymode_prob[4], kf_uv_mode_prob[3]; /* keyframe "" */
+
+ int ymode_count[5], uv_mode_count[4]; /* intra MB type cts this frame */
+
+ int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+ int this_frame_percent_intra;
+ int last_frame_percent_intra;
+
+
+} CODING_CONTEXT;
+
+typedef struct
+{
+ double frame;
+ double intra_error;
+ double coded_error;
+ double ssim_weighted_pred_err;
+ double pcnt_inter;
+ double pcnt_motion;
+ double pcnt_second_ref;
+ double MVr;
+ double mvr_abs;
+ double MVc;
+ double mvc_abs;
+ double MVrv;
+ double MVcv;
+ double mv_in_out_count;
+ double duration;
+ double count;
+}
+FIRSTPASS_STATS;
+
+typedef struct
+{
+ int frames_so_far;
+ double frame_intra_error;
+ double frame_coded_error;
+ double frame_pcnt_inter;
+ double frame_pcnt_motion;
+ double frame_mvr;
+ double frame_mvr_abs;
+ double frame_mvc;
+ double frame_mvc_abs;
+
+} ONEPASS_FRAMESTATS;
+
+
+typedef enum
+{
+ THR_ZEROMV = 0,
+ THR_DC = 1,
+
+ THR_NEARESTMV = 2,
+ THR_NEARMV = 3,
+
+ THR_ZEROG = 4,
+ THR_NEARESTG = 5,
+
+ THR_ZEROA = 6,
+ THR_NEARESTA = 7,
+
+ THR_NEARG = 8,
+ THR_NEARA = 9,
+
+ THR_V_PRED = 10,
+ THR_H_PRED = 11,
+ THR_TM = 12,
+
+ THR_NEWMV = 13,
+ THR_NEWG = 14,
+ THR_NEWA = 15,
+
+ THR_SPLITMV = 16,
+ THR_SPLITG = 17,
+ THR_SPLITA = 18,
+
+ THR_B_PRED = 19,
+}
+THR_MODES;
+
+typedef enum
+{
+ DIAMOND = 0,
+ NSTEP = 1,
+ HEX = 2
+} SEARCH_METHODS;
+
+typedef struct
+{
+ int RD;
+ SEARCH_METHODS search_method;
+ int improved_quant;
+ int improved_dct;
+ int auto_filter;
+ int recode_loop;
+ int iterative_sub_pixel;
+ int half_pixel_search;
+ int quarter_pixel_search;
+ int thresh_mult[MAX_MODES];
+ int full_freq[2];
+ int min_fs_radius;
+ int max_fs_radius;
+ int max_step_search_steps;
+ int first_step;
+ int optimize_coefficients;
+
+} SPEED_FEATURES;
+
+typedef struct
+{
+ MACROBLOCK mb;
+ int mb_row;
+ TOKENEXTRA *tp;
+ int segment_counts[MAX_MB_SEGMENTS];
+ int totalrate;
+ int current_mb_col;
+} MB_ROW_COMP;
+
+typedef struct
+{
+ TOKENEXTRA *start;
+ TOKENEXTRA *stop;
+} TOKENLIST;
+
+typedef struct
+{
+ int ithread;
+ void *ptr1;
+ void *ptr2;
+} ENCODETHREAD_DATA;
+typedef struct
+{
+ int ithread;
+ void *ptr1;
+} LPFTHREAD_DATA;
+
+typedef struct
+{
+ INT64 source_time_stamp;
+ INT64 source_end_time_stamp;
+
+ DECLARE_ALIGNED(16, YV12_BUFFER_CONFIG, source_buffer);
+ unsigned int source_frame_flags;
+} SOURCE_SAMPLE;
+
+typedef struct VP8_ENCODER_RTCD
+{
+ VP8_COMMON_RTCD *common;
+ vp8_variance_rtcd_vtable_t variance;
+ vp8_fdct_rtcd_vtable_t fdct;
+ vp8_encodemb_rtcd_vtable_t encodemb;
+ vp8_quantize_rtcd_vtable_t quantize;
+ vp8_search_rtcd_vtable_t search;
+} VP8_ENCODER_RTCD;
+
+typedef struct
+{
+
+ DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][4][4]);
+
+ DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][4][4]);
+
+ DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][4][4]);
+
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+
+
+ MACROBLOCK mb;
+ VP8_COMMON common;
+ vp8_writer bc, bc2;
+ // bool_writer *bc2;
+
+ VP8_CONFIG oxcf;
+
+ YV12_BUFFER_CONFIG *Source;
+ YV12_BUFFER_CONFIG *un_scaled_source;
+ INT64 source_time_stamp;
+ INT64 source_end_time_stamp;
+ unsigned int source_frame_flags;
+ YV12_BUFFER_CONFIG scaled_source;
+
+ int source_buffer_count;
+ int source_encode_index;
+ int source_alt_ref_pending;
+ int source_alt_ref_active;
+
+ int last_alt_ref_sei;
+ int is_src_frame_alt_ref;
+
+ int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
+ int alt_is_last; // Alt reference frame same as last ( short circuit altref search)
+ int gold_is_alt; // don't do both alt and gold search ( just do gold).
+
+ //int refresh_alt_ref_frame;
+ SOURCE_SAMPLE src_buffer[MAX_LAG_BUFFERS];
+
+ YV12_BUFFER_CONFIG last_frame_uf;
+
+ char *Dest;
+
+ TOKENEXTRA *tok;
+ unsigned int tok_count;
+
+
+ unsigned int frames_since_key;
+ unsigned int key_frame_frequency;
+ unsigned int next_key;
+
+ unsigned int mode_check_freq[MAX_MODES];
+ unsigned int mode_test_hit_counts[MAX_MODES];
+ unsigned int mode_chosen_counts[MAX_MODES];
+ unsigned int mbs_tested_so_far;
+
+ unsigned int check_freq[2];
+ unsigned int do_full[2];
+
+ int rd_thresh_mult[MAX_MODES];
+ int rd_baseline_thresh[MAX_MODES];
+ int rd_threshes[MAX_MODES];
+ int mvcostbase;
+ int mvcostmultiplier;
+ int subseqblockweight;
+ int errthresh;
+
+#ifdef INTRARDOPT
+ int RDMULT;
+ int RDDIV ;
+
+ TOKENEXTRA *rdtok;
+ int intra_rd_opt;
+ vp8_writer rdbc;
+ int intra_mode_costs[10];
+#endif
+
+
+ CODING_CONTEXT coding_context;
+
+ // Rate targetting variables
+ long long prediction_error;
+ long long last_prediction_error;
+ long long intra_error;
+ long long last_intra_error;
+ long long last_auto_filter_prediction_error;
+
+#if 0
+ // Experimental RD code
+ long long frame_distortion;
+ long long last_frame_distortion;
+#endif
+
+ int last_mb_distortion;
+
+ int frames_since_auto_filter;
+
+ int this_frame_target;
+ int projected_frame_size;
+ int last_q[2]; // Separate values for Intra/Inter
+ int target_bits_per_mb;
+
+ double rate_correction_factor;
+ double key_frame_rate_correction_factor;
+ double gf_rate_correction_factor;
+ double est_max_qcorrection_factor;
+
+ int frames_till_gf_update_due; // Count down till next GF
+ int current_gf_interval; // GF interval chosen when we coded the last GF
+
+ int gf_overspend_bits; // Total bits overspent because of GF boost (cumulative)
+
+ int gf_group_bits; // Projected Bits available for a group of frames including 1 GF or ARF
+ int gf_bits; // Bits for the golden frame or ARF - 2 pass only
+ int mid_gf_extra_bits; // A few extra bits for the frame half way between two gfs.
+
+ int kf_group_bits; // Projected total bits available for a key frame group of frames
+ int kf_group_error_left; // Error score of frames still to be coded in kf group
+ int kf_bits; // Bits for the key frame in a key frame group - 2 pass only
+
+ int non_gf_bitrate_adjustment; // Used in the few frames following a GF to recover the extra bits spent in that GF
+ int initial_gf_use; // percentage use of gf 2 frames after gf
+
+ int gf_group_error_left; // Remaining error from uncoded frames in a gf group. Two pass use only
+
+ int kf_overspend_bits; // Extra bits spent on key frames that need to be recovered on inter frames
+ int kf_bitrate_adjustment; // Current number of bits to try and recover on each inter frame.
+ int max_gf_interval;
+ int baseline_gf_interval;
+ int gf_decay_rate;
+
+ INT64 key_frame_count;
+ INT64 tot_key_frame_bits;
+ int prior_key_frame_size[KEY_FRAME_CONTEXT];
+ int prior_key_frame_distance[KEY_FRAME_CONTEXT];
+ int per_frame_bandwidth; // Current section per frame bandwidth target
+ int av_per_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation that should be used for any frame
+ int last_key_frame_size;
+ int intra_frame_target;
+ int inter_frame_target;
+ double output_frame_rate;
+ long long last_time_stamp_seen;
+ long long first_time_stamp_ever;
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex;
+
+ int zbin_over_quant;
+ int zbin_mode_boost;
+ int zbin_mode_boost_enabled;
+
+ INT64 total_byte_count;
+
+ int buffered_mode;
+
+ int buffer_level;
+ int bits_off_target;
+
+ int rolling_target_bits;
+ int rolling_actual_bits;
+
+ int long_rolling_target_bits;
+ int long_rolling_actual_bits;
+
+ long long total_actual_bits;
+ int total_target_vs_actual; // debug stats
+
+ int worst_quality;
+ int active_worst_quality;
+ int best_quality;
+ int active_best_quality;
+
+ int drop_frames_allowed; // Are we permitted to drop frames?
+ int drop_frame; // Drop this frame?
+ int drop_count; // How many frames have we dropped?
+ int max_drop_count; // How many frames should we drop?
+ int max_consec_dropped_frames; // Limit number of consecutive frames that can be dropped.
+
+
+ int ymode_count [VP8_YMODES]; /* intra MB type cts this frame */
+ int uv_mode_count[VP8_UV_MODES]; /* intra MB type cts this frame */
+
+ unsigned int MVcount [2] [MVvals]; /* (row,col) MV cts this frame */
+
+ unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens]; /* for this frame */
+ //DECLARE_ALIGNED(16, int, coef_counts_backup [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens]); //not used any more
+ // Save the vp8_tree_probs_from_distribution result for each frame to avoid repeated calculation
+ vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1];
+ unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1][2];
+
+ /* Second compressed data partition contains coefficient data. */
+
+ unsigned char *output_partition2;
+ size_t output_partition2size;
+
+ pre_proc_instance ppi;
+
+ int frames_to_key;
+ int gfu_boost;
+ int kf_boost;
+ int last_boost;
+ double total_error_left;
+ double total_intra_error_left;
+ double total_coded_error_left;
+ double start_tot_err_left;
+ double min_error;
+
+ double modified_total_error_left;
+ double avg_iiratio;
+
+ int target_bandwidth;
+ long long bits_left;
+ FIRSTPASS_STATS total_stats;
+ FIRSTPASS_STATS this_frame_stats;
+ FIRSTPASS_STATS *stats_in, *stats_in_end;
+ struct vpx_codec_pkt_list *output_pkt_list;
+ int first_pass_done;
+ unsigned char *fp_motion_map;
+ FILE *fp_motion_mapfile;
+ int fpmm_pos;
+
+#if 0
+ // Experimental code for lagged and one pass
+ ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
+ int one_pass_frame_index;
+#endif
+
+ int decimation_factor;
+ int decimation_count;
+
+ // for real time encoding
+ int avg_encode_time; //microsecond
+ int avg_pick_mode_time; //microsecond
+ int Speed;
+ unsigned int cpu_freq; //Mhz
+ int compressor_speed;
+
+ int interquantizer;
+ int auto_gold;
+ int auto_adjust_gold_quantizer;
+ int goldquantizer;
+ int goldfreq;
+ int auto_adjust_key_quantizer;
+ int keyquantizer;
+ int auto_worst_q;
+ int filter_type;
+ int cpu_used;
+ int chroma_boost;
+ int horiz_scale;
+ int vert_scale;
+ int pass;
+
+
+ int prob_intra_coded;
+ int prob_last_coded;
+ int prob_gf_coded;
+ int prob_skip_false;
+ int last_skip_false_probs[3];
+ int last_skip_probs_q[3];
+ int recent_ref_frame_usage[MAX_REF_FRAMES];
+
+ int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+ int this_frame_percent_intra;
+ int last_frame_percent_intra;
+
+ int last_key_frame_q;
+ int last_kffilt_lvl;
+
+ int ref_frame_flags;
+
+ int exp[512];
+
+ SPEED_FEATURES sf;
+ int error_bins[1024];
+
+ int inter_lvl;
+ int intra_lvl;
+ int motion_lvl;
+ int motion_speed;
+ int motion_var;
+ int next_iiratio;
+ int this_iiratio;
+ int this_frame_modified_error;
+
+ double norm_intra_err_per_mb;
+ double norm_inter_err_per_mb;
+ double norm_iidiff_per_mb;
+
+ int last_best_mode_index; // Record of mode index chosen for previous macro block.
+ int last_auto_filt_val;
+ int last_auto_filt_q;
+
+ // Data used for real time conferencing mode to help determine if it would be good to update the gf
+ int inter_zz_count;
+ int gf_bad_count;
+ int gf_update_recommended;
+ int skip_true_count;
+ int skip_false_count;
+
+ int alt_qcount;
+
+ int ready_for_new_frame;
+
+ unsigned char *segmentation_map;
+ signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; // Segment data (can be deltas or absolute values)
+ int segment_encode_breakout[MAX_MB_SEGMENTS]; // segment threshold for encode breakout
+
+ unsigned char *active_map;
+ unsigned int active_map_enabled;
+ // Video conferencing cyclic refresh mode flags etc
+ // This is a mode designed to clean up the background over time in live encoding scenarios. It uses segmentation.
+ int cyclic_refresh_mode_enabled;
+ int cyclic_refresh_mode_max_mbs_perframe;
+ int cyclic_refresh_mode_index;
+ int cyclic_refresh_q;
+ signed char *cyclic_refresh_map;
+
+ // multithread data
+ int current_mb_col_main;
+ int processor_core_count;
+ int b_multi_threaded;
+ int encoding_thread_count;
+
+#if CONFIG_MULTITHREAD
+ pthread_t *h_encoding_thread;
+#endif
+ MB_ROW_COMP *mb_row_ei;
+ ENCODETHREAD_DATA *en_thread_data;
+
+#if CONFIG_MULTITHREAD
+ //events
+ sem_t *h_event_mbrencoding;
+ sem_t h_event_main;
+#endif
+
+ TOKENLIST *tplist;
+ // end of multithread data
+
+
+ fractional_mv_step_fp *find_fractional_mv_step;
+ vp8_full_search_fn_t full_search_sad;
+ vp8_diamond_search_fn_t diamond_search_sad;
+ vp8_variance_fn_ptr_t fn_ptr;
+ unsigned int time_receive_data;
+ unsigned int time_compress_data;
+ unsigned int time_pick_lpf;
+ unsigned int time_encode_mb_row;
+
+ unsigned int tempdata1;
+ unsigned int tempdata2;
+
+ int base_skip_false_prob[128];
+ unsigned int section_is_low_motion;
+ unsigned int section_benefits_from_aggresive_q;
+ unsigned int section_is_fast_motion;
+ unsigned int section_intra_rating;
+
+ double section_max_qfactor;
+
+
+#if CONFIG_RUNTIME_CPU_DETECT
+ VP8_ENCODER_RTCD rtcd;
+#endif
+#if VP8_TEMPORAL_ALT_REF
+ SOURCE_SAMPLE alt_ref_buffer;
+ unsigned char *frames[MAX_LAG_BUFFERS];
+ int fixed_divide[255];
+#endif
+
+#if CONFIG_PSNR
+ int count;
+ double total_y;
+ double total_u;
+ double total_v;
+ double total ;
+ double total_sq_error;
+ double totalp_y;
+ double totalp_u;
+ double totalp_v;
+ double totalp;
+ double total_sq_error2;
+ int bytes;
+ double summed_quality;
+ double summed_weights;
+ unsigned int tot_recode_hits;
+
+
+ double total_ssimg_y;
+ double total_ssimg_u;
+ double total_ssimg_v;
+ double total_ssimg_all;
+
+ int b_calculate_ssimg;
+#endif
+ int b_calculate_psnr;
+} VP8_COMP;
+
+void control_data_rate(VP8_COMP *cpi);
+
+void vp8_encode_frame(VP8_COMP *cpi);
+
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size);
+
+int rd_cost_intra_mb(MACROBLOCKD *x);
+
+void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **);
+
+void vp8_set_speed_features(VP8_COMP *cpi);
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval" at %s:%d", \
+ __FILE__,__LINE__);\
+ } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval);\
+ } while(0)
+#endif
+#endif
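The CHECK_MEM_ERROR macro above wraps an allocation so that a NULL result aborts the encoder through vpx_internal_error, stringizing the lvalue for the error message (and, in the CONFIG_DEBUG build, recording the file and line). A minimal usage sketch, with an illustrative allocation target and size that are not taken from this patch, from inside any function that has a VP8_COMP *cpi in scope:

    /* Illustrative only: allocate the token list for one macroblock row per row. */
    CHECK_MEM_ERROR(cpi->tplist,
                    vpx_calloc(cpi->common.mb_rows, sizeof(TOKENLIST)));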
diff --git a/vp8/encoder/parms.cpp b/vp8/encoder/parms.cpp
new file mode 100644
index 000000000..66fdafb1a
--- /dev/null
+++ b/vp8/encoder/parms.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#if 0
+
+#include <map>
+#include <string>
+#include <fstream>
+extern "C"
+{
+ #include "onyx.h"
+}
+
+
+using namespace std;
+
+typedef map<string,int> Parms;
+
+#define ALLPARMS(O,DOTHIS) \
+ DOTHIS(O, interquantizer )\
+ DOTHIS(O, auto_gold )\
+ DOTHIS(O, auto_adjust_gold_quantizer )\
+ DOTHIS(O, goldquantizer )\
+ DOTHIS(O, goldfreq )\
+ DOTHIS(O, auto_key )\
+ DOTHIS(O, auto_adjust_key_quantizer )\
+ DOTHIS(O, keyquantizer )\
+ DOTHIS(O, keyfreq )\
+ DOTHIS(O, pass )\
+ DOTHIS(O, fixed_q )\
+ DOTHIS(O, target_bandwidth )\
+ DOTHIS(O, auto_worst_q )\
+ DOTHIS(O, worst_quality )\
+ DOTHIS(O, best_allowed_q )\
+ DOTHIS(O, end_usage )\
+ DOTHIS(O, starting_buffer_level )\
+ DOTHIS(O, optimal_buffer_level )\
+ DOTHIS(O, maximum_buffer_size )\
+ DOTHIS(O, under_shoot_pct )\
+ DOTHIS(O, allow_df )\
+ DOTHIS(O, drop_frames_water_mark )\
+ DOTHIS(O, max_allowed_datarate )\
+ DOTHIS(O, two_pass_vbrbias )\
+ DOTHIS(O, two_pass_vbrmin_section )\
+ DOTHIS(O, two_pass_vbrmax_section )\
+ DOTHIS(O, filter_type )\
+ DOTHIS(O, compressor_speed )\
+ DOTHIS(O, mbpitch_feature )\
+ DOTHIS(O, allow_spatial_resampling )\
+ DOTHIS(O, resample_down_water_mark )\
+ DOTHIS(O, resample_up_water_mark )\
+ DOTHIS(O, noise_sensitivity )\
+ DOTHIS(O, horiz_scale )\
+ DOTHIS(O, vert_scale )
+
+
+#define GET(O,V) O->V = x[#V];
+#define PUT(O,V) x[#V] = O->V;
+
+
+extern "C" void get_parms(VP8_CONFIG *ocf,char *filename)
+{
+
+ Parms x;
+ int value;
+ string variable;
+ string equal;
+
+ ifstream config_file(filename);
+
+ ALLPARMS(ocf, PUT);
+
+ // store all the parms in a map (really simple parsing)
+ while(!config_file.eof() && config_file.is_open())
+ {
+ config_file >> variable;
+ config_file >> equal;
+
+ if(equal != "=")
+ continue;
+
+ config_file >> value;
+
+ x[variable] = value;
+ }
+
+ ALLPARMS(ocf, GET);
+
+}
+
+#define PRINT(O,V) debug_file<<#V <<" = " << O->V <<"\n";
+extern "C" void print_parms(VP8_CONFIG *ocf,char *filename)
+{
+ ofstream debug_file(filename,ios_base::app);
+ ALLPARMS(ocf, PRINT);
+ debug_file << "=============================================="<<"\n";
+}
+
+#endif
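get_parms() above (compiled out under #if 0) reads a plain text file of name = value lines, one per parameter, where the names match the fields listed in ALLPARMS; because the parser reads whitespace-delimited tokens, the = must be separated from the name and value by spaces, and any line whose second token is not "=" is skipped. A small illustrative file, with made-up values rather than encoder defaults:

    auto_key = 1
    keyfreq = 120
    worst_quality = 56
    end_usage = 0

print_parms() appends the current values of the same fields to the named file, so a configuration can be dumped and re-read later.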
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
new file mode 100644
index 000000000..d61e2ceda
--- /dev/null
+++ b/vp8/encoder/pickinter.c
@@ -0,0 +1,923 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_ports/config.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "entropymode.h"
+#include "pickinter.h"
+#include "findnearmv.h"
+#include "encodemb.h"
+#include "reconinter.h"
+#include "reconintra.h"
+#include "reconintra4x4.h"
+#include "g_common.h"
+#include "variance.h"
+#include "mcomp.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd);
+
+#ifdef SPEEDSTATS
+extern unsigned int cnt_pm;
+#endif
+
+extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];
+extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
+
+
+extern unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+extern int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *, int *, int *, int, int *mvcost[2], int, int fullpixel);
+extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
+extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv);
+
+
+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+ (void) b;
+ (void) d;
+ (void) ref_mv;
+ (void) error_per_bit;
+ (void) svf;
+ (void) vf;
+ (void) mvcost;
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
+ return 0;
+}
+
+
+static int get_inter_mbpred_error(MACROBLOCK *mb, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, unsigned int *sse)
+{
+
+ BLOCK *b = &mb->block[0];
+ BLOCKD *d = &mb->e_mbd.block[0];
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what = *(d->base_pre) + d->pre ;
+ int in_what_stride = d->pre_stride;
+ int xoffset = d->bmi.mv.as_mv.col & 7;
+ int yoffset = d->bmi.mv.as_mv.row & 7;
+
+ in_what += (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (xoffset | yoffset)
+ {
+ return svf(in_what, in_what_stride, xoffset, yoffset, what, what_stride, sse);
+ }
+ else
+ {
+ return vf(what, what_stride, in_what, in_what_stride, sse);
+ }
+
+}
+
+unsigned int vp8_get16x16pred_error_c
+(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad
+)
+{
+ unsigned pred_error = 0;
+ int i, j;
+ int sum = 0;
+
+ for (i = 0; i < 16; i++)
+ {
+ int diff;
+
+ for (j = 0; j < 16; j++)
+ {
+ diff = src_ptr[j] - ref_ptr[j];
+ sum += diff;
+ pred_error += diff * diff;
+ }
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+
+ pred_error -= sum * sum / 256;
+ return pred_error;
+}
+
+
+unsigned int vp8_get4x4sse_cs_c
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ int max_sad
+)
+{
+ int distortion = 0;
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int diff = src_ptr[c] - ref_ptr[c];
+ distortion += diff * diff;
+ }
+
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
+
+ return distortion;
+}
+
+static int get_prediction_error(BLOCK *be, BLOCKD *b, const vp8_variance_rtcd_vtable_t *rtcd)
+{
+ unsigned char *sptr;
+ unsigned char *dptr;
+ sptr = (*(be->base_src) + be->src);
+ dptr = b->predictor;
+
+ return VARIANCE_INVOKE(rtcd, get4x4sse_cs)(sptr, be->src_stride, dptr, 16, 0x7fffffff);
+
+}
+
+static int pick_intra4x4block(
+ const VP8_ENCODER_RTCD *rtcd,
+ MACROBLOCK *x,
+ BLOCK *be,
+ BLOCKD *b,
+ B_PREDICTION_MODE *best_mode,
+ B_PREDICTION_MODE above,
+ B_PREDICTION_MODE left,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+
+ int *bestrate,
+ int *bestdistortion)
+{
+ B_PREDICTION_MODE mode;
+ int best_rd = INT_MAX; // 1<<30
+ int rate;
+ int distortion;
+ unsigned int *mode_costs;
+ (void) l;
+ (void) a;
+
+ if (x->e_mbd.frame_type == KEY_FRAME)
+ {
+ mode_costs = x->bmode_costs[above][left];
+ }
+ else
+ {
+ mode_costs = x->inter_bmode_costs;
+ }
+
+ for (mode = B_DC_PRED; mode <= B_HE_PRED /*B_HU_PRED*/; mode++)
+ {
+ int this_rd;
+
+ rate = mode_costs[mode];
+ vp8_predict_intra4x4(b, mode, b->predictor);
+ distortion = get_prediction_error(be, b, &rtcd->variance);
+ this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ *bestrate = rate;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ *best_mode = mode;
+ }
+ }
+
+ b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
+ vp8_encode_intra4x4block(rtcd, x, be, b, b->bmi.mode);
+ return best_rd;
+}
+
+
+int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int *Rate, int *best_dist)
+{
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ int i;
+ TEMP_CONTEXT t;
+ int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+ int error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, 0); // Rd estimate for the cost of the block prediction mode
+ int distortion = 0;
+
+ vp8_intra_prediction_down_copy(xd);
+ vp8_setup_temp_context(&t, xd->above_context[Y1CONTEXT], xd->left_context[Y1CONTEXT], 4);
+
+ for (i = 0; i < 16; i++)
+ {
+ MODE_INFO *const mic = xd->mode_info_context;
+ const int mis = xd->mode_info_stride;
+ const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode;
+ const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode;
+ B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+ int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d);
+
+ error += pick_intra4x4block(rtcd,
+ mb, mb->block + i, xd->block + i, &best_mode, A, L,
+ t.a + vp8_block2above[i],
+ t.l + vp8_block2left[i], &r, &d);
+
+ cost += r;
+ distortion += d;
+
+ mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode;
+
+        // Break out if we have already exceeded the best-so-far value that was passed in
+ if (distortion > *best_dist)
+ break;
+ }
+
+ for (i = 0; i < 16; i++)
+ xd->block[i].bmi.mv.as_int = 0;
+
+ *Rate = cost;
+
+ if (i == 16)
+ *best_dist = distortion;
+ else
+ *best_dist = INT_MAX;
+
+
+ return error;
+}
+
+int vp8_pick_intra_mbuv_mode(MACROBLOCK *mb)
+{
+
+ MACROBLOCKD *x = &mb->e_mbd;
+ unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+ unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+ unsigned char *usrc_ptr = (mb->block[16].src + *mb->block[16].base_src);
+ unsigned char *vsrc_ptr = (mb->block[20].src + *mb->block[20].base_src);
+ int uvsrc_stride = mb->block[16].src_stride;
+ unsigned char uleft_col[8];
+ unsigned char vleft_col[8];
+ unsigned char utop_left = uabove_row[-1];
+ unsigned char vtop_left = vabove_row[-1];
+ int i, j;
+ int expected_udc;
+ int expected_vdc;
+ int shift;
+ int Uaverage = 0;
+ int Vaverage = 0;
+ int diff;
+ int pred_error[4] = {0, 0, 0, 0}, best_error = INT_MAX;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+
+
+ for (i = 0; i < 8; i++)
+ {
+ uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+ vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+ }
+
+ if (!x->up_available && !x->left_available)
+ {
+ expected_udc = 128;
+ expected_vdc = 128;
+ }
+ else
+ {
+ shift = 2;
+
+ if (x->up_available)
+ {
+
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uabove_row[i];
+ Vaverage += vabove_row[i];
+ }
+
+ shift ++;
+
+ }
+
+ if (x->left_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uleft_col[i];
+ Vaverage += vleft_col[i];
+ }
+
+ shift ++;
+
+ }
+
+ expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+ expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+ }
+
+
+ for (i = 0; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++)
+ {
+
+ int predu = uleft_col[i] + uabove_row[j] - utop_left;
+ int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+ int u_p, v_p;
+
+ u_p = usrc_ptr[j];
+ v_p = vsrc_ptr[j];
+
+ if (predu < 0)
+ predu = 0;
+
+ if (predu > 255)
+ predu = 255;
+
+ if (predv < 0)
+ predv = 0;
+
+ if (predv > 255)
+ predv = 255;
+
+
+ diff = u_p - expected_udc;
+ pred_error[DC_PRED] += diff * diff;
+ diff = v_p - expected_vdc;
+ pred_error[DC_PRED] += diff * diff;
+
+
+ diff = u_p - uabove_row[j];
+ pred_error[V_PRED] += diff * diff;
+ diff = v_p - vabove_row[j];
+ pred_error[V_PRED] += diff * diff;
+
+
+ diff = u_p - uleft_col[i];
+ pred_error[H_PRED] += diff * diff;
+ diff = v_p - vleft_col[i];
+ pred_error[H_PRED] += diff * diff;
+
+
+ diff = u_p - predu;
+ pred_error[TM_PRED] += diff * diff;
+ diff = v_p - predv;
+ pred_error[TM_PRED] += diff * diff;
+
+
+ }
+
+ usrc_ptr += uvsrc_stride;
+ vsrc_ptr += uvsrc_stride;
+
+ if (i == 3)
+ {
+ usrc_ptr = (mb->block[18].src + *mb->block[18].base_src);
+ vsrc_ptr = (mb->block[22].src + *mb->block[22].base_src);
+ }
+
+
+
+ }
+
+
+ for (i = DC_PRED; i <= TM_PRED; i++)
+ {
+ if (best_error > pred_error[i])
+ {
+ best_error = pred_error[i];
+ best_mode = (MB_PREDICTION_MODE)i;
+ }
+ }
+
+
+ mb->e_mbd.mbmi.uv_mode = best_mode;
+ return best_error;
+
+}
+
+
+int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
+{
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+ B_MODE_INFO best_bmodes[16];
+ MB_MODE_INFO best_mbmode;
+ MV best_ref_mv1;
+ MV mode_mv[MB_MODE_COUNT];
+ MB_PREDICTION_MODE this_mode;
+ int num00;
+ int i;
+ int mdcounts[4];
+ int best_rd = INT_MAX; // 1 << 30;
+ int best_intra_rd = INT_MAX;
+ int mode_index;
+ int ref_frame_cost[MAX_REF_FRAMES];
+ int rate;
+ int rate2;
+ int distortion2;
+ int bestsme;
+ //int all_rds[MAX_MODES]; // Experimental debug code.
+ int best_mode_index = 0;
+ int sse = INT_MAX;
+
+ MV nearest_mv[4];
+ MV near_mv[4];
+ MV best_ref_mv[4];
+ int MDCounts[4][4];
+ unsigned char *y_buffer[4];
+ unsigned char *u_buffer[4];
+ unsigned char *v_buffer[4];
+
+ int skip_mode[4] = {0, 0, 0, 0};
+
+ vpx_memset(mode_mv, 0, sizeof(mode_mv));
+ vpx_memset(nearest_mv, 0, sizeof(nearest_mv));
+ vpx_memset(near_mv, 0, sizeof(near_mv));
+
+
+ // set up all the refframe dependent pointers.
+ if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+ {
+ vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[LAST_FRAME], &near_mv[LAST_FRAME],
+ &best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);
+
+ y_buffer[LAST_FRAME] = cpi->common.last_frame.y_buffer + recon_yoffset;
+ u_buffer[LAST_FRAME] = cpi->common.last_frame.u_buffer + recon_uvoffset;
+ v_buffer[LAST_FRAME] = cpi->common.last_frame.v_buffer + recon_uvoffset;
+ }
+ else
+ skip_mode[LAST_FRAME] = 1;
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+ {
+ vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[GOLDEN_FRAME], &near_mv[GOLDEN_FRAME],
+ &best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);
+
+ y_buffer[GOLDEN_FRAME] = cpi->common.golden_frame.y_buffer + recon_yoffset;
+ u_buffer[GOLDEN_FRAME] = cpi->common.golden_frame.u_buffer + recon_uvoffset;
+ v_buffer[GOLDEN_FRAME] = cpi->common.golden_frame.v_buffer + recon_uvoffset;
+ }
+ else
+ skip_mode[GOLDEN_FRAME] = 1;
+
+ if (cpi->ref_frame_flags & VP8_ALT_FLAG && cpi->source_alt_ref_active)
+ {
+ vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[ALTREF_FRAME], &near_mv[ALTREF_FRAME],
+ &best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);
+
+ y_buffer[ALTREF_FRAME] = cpi->common.alt_ref_frame.y_buffer + recon_yoffset;
+ u_buffer[ALTREF_FRAME] = cpi->common.alt_ref_frame.u_buffer + recon_uvoffset;
+ v_buffer[ALTREF_FRAME] = cpi->common.alt_ref_frame.v_buffer + recon_uvoffset;
+ }
+ else
+ skip_mode[ALTREF_FRAME] = 1;
+
+ cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame
+
+ *returnintra = best_intra_rd;
+ x->skip = 0;
+
+ ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded);
+
+ // Special case treatment when GF and ARF are not sensible options for reference
+ if (cpi->ref_frame_flags == VP8_LAST_FLAG)
+ {
+ ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_zero(255);
+ ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(255)
+ + vp8_cost_zero(128);
+ ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(255)
+ + vp8_cost_one(128);
+ }
+ else
+ {
+ ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_zero(cpi->prob_last_coded);
+ ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(cpi->prob_last_coded)
+ + vp8_cost_zero(cpi->prob_gf_coded);
+ ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(cpi->prob_last_coded)
+ + vp8_cost_one(cpi->prob_gf_coded);
+ }
+
+
+
+ best_rd = INT_MAX;
+
+ x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+
+    // If we encode a new MV this is important:
+    // find the best new motion vector.
+ for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
+ {
+ int frame_cost;
+ int this_rd = INT_MAX;
+
+ if (best_rd <= cpi->rd_threshes[mode_index])
+ continue;
+
+ x->e_mbd.mbmi.ref_frame = vp8_ref_frame_order[mode_index];
+
+ if (skip_mode[x->e_mbd.mbmi.ref_frame])
+ continue;
+
+ // Check to see if the testing frequency for this mode is at its max
+ // If so then prevent it from being tested and increase the threshold for its testing
+ if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
+ {
+ //if ( (cpi->mbs_tested_so_far / cpi->mode_test_hit_counts[mode_index]) <= cpi->mode_check_freq[mode_index] )
+ if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index]))
+ {
+ // Increase the threshold for coding this mode to make it less likely to be chosen
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+
+ continue;
+ }
+ }
+
+        // We have now reached the point where we are going to test the current mode, so increment the counter for the number of times it has been tested
+ cpi->mode_test_hit_counts[mode_index] ++;
+
+ rate2 = 0;
+ distortion2 = 0;
+
+ this_mode = vp8_mode_order[mode_index];
+
+ // Experimental debug code.
+ //all_rds[mode_index] = -1;
+
+ x->e_mbd.mbmi.mode = this_mode;
+ x->e_mbd.mbmi.uv_mode = DC_PRED;
+
+        // Work out the cost associated with selecting the reference frame
+ frame_cost = ref_frame_cost[x->e_mbd.mbmi.ref_frame];
+ rate2 += frame_cost;
+
+ // everything but intra
+ if (x->e_mbd.mbmi.ref_frame)
+ {
+ x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mbmi.ref_frame];
+ x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mbmi.ref_frame];
+ x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mbmi.ref_frame];
+ mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mbmi.ref_frame];
+ mode_mv[NEARMV] = near_mv[x->e_mbd.mbmi.ref_frame];
+ best_ref_mv1 = best_ref_mv[x->e_mbd.mbmi.ref_frame];
+ memcpy(mdcounts, MDCounts[x->e_mbd.mbmi.ref_frame], sizeof(mdcounts));
+ }
+
+        // Only consider ZEROMV/ALTREF_FRAME for the alt ref frame.
+ if (cpi->is_src_frame_alt_ref)
+ {
+ if (this_mode != ZEROMV || x->e_mbd.mbmi.ref_frame != ALTREF_FRAME)
+ continue;
+ }
+
+ switch (this_mode)
+ {
+ case B_PRED:
+ distortion2 = *returndistortion; // Best so far passed in as breakout value to vp8_pick_intra4x4mby_modes
+ vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2);
+ rate2 += rate;
+ distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
+
+ if (distortion2 == INT_MAX)
+ {
+ this_rd = INT_MAX;
+ }
+ else
+ {
+ this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (this_rd < best_intra_rd)
+ {
+ best_intra_rd = this_rd;
+ *returnintra = best_intra_rd ;
+ }
+ }
+
+ break;
+
+ case SPLITMV:
+
+            // Split MV modes currently not supported when RD is not enabled.
+ break;
+
+ case DC_PRED:
+ case V_PRED:
+ case H_PRED:
+ case TM_PRED:
+ vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
+ rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.mode];
+ this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (this_rd < best_intra_rd)
+ {
+ best_intra_rd = this_rd;
+ *returnintra = best_intra_rd ;
+ }
+
+ break;
+
+ case NEWMV:
+ {
+ int thissme;
+ int step_param;
+ int further_steps;
+ int n = 0;
+ int sadpb = x->sadperbit16;
+
+ // Further step/diamond searches as necessary
+ if (cpi->Speed < 8)
+ {
+ step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
+ further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+ }
+ else
+ {
+ step_param = cpi->sf.first_step + 2;
+ further_steps = 0;
+ }
+
+#if 0
+
+ // Initial step Search
+ bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost);
+ mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+ // Further step searches
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost);
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+ }
+ else
+ {
+ d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
+ d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
+ }
+ }
+ }
+
+#else
+
+ if (cpi->sf.search_method == HEX)
+ {
+ bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf, x->mvsadcost, x->mvcost);
+ mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+ }
+ else
+ {
+ bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb < 9
+ mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+ // Further step/diamond searches as necessary
+ n = 0;
+ //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb = 9
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+ }
+ else
+ {
+ d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
+ d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
+ }
+ }
+ }
+ }
+
+#endif
+ }
+
+ if (bestsme < INT_MAX)
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, cpi->fn_ptr.svf, cpi->fn_ptr.vf, cpi->mb.mvcost);
+
+ mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+            // MV cost
+ rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv1, cpi->mb.mvcost, 128);
+
+
+ case NEARESTMV:
+ case NEARMV:
+
+ if (mode_mv[this_mode].row == 0 && mode_mv[this_mode].col == 0)
+ continue;
+
+ case ZEROMV:
+
+ // Trap vectors that reach beyond the UMV borders
+            // Note that ALL New MV, Nearest MV, Near MV and Zero MV code drops through to this point
+ // because of the lack of break statements in the previous two cases.
+ if (((mode_mv[this_mode].row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].col >> 3) > x->mv_col_max))
+ continue;
+
+ rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+ x->e_mbd.mbmi.mode = this_mode;
+ x->e_mbd.mbmi.mv.as_mv = mode_mv[this_mode];
+ x->e_mbd.block[0].bmi.mode = this_mode;
+ x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mbmi.mv.as_int;
+
+ distortion2 = get_inter_mbpred_error(x, cpi->fn_ptr.svf, cpi->fn_ptr.vf, (unsigned int *)(&sse));
+
+ this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+ {
+ x->skip = 1;
+ }
+ else if (sse < x->encode_breakout)
+ {
+ // Check u and v to make sure skip is ok
+ int sse2 = 0;
+
+ sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));
+
+ if (sse2 * 2 < x->encode_breakout)
+ x->skip = 1;
+ else
+ x->skip = 0;
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ // Experimental debug code.
+ //all_rds[mode_index] = this_rd;
+
+ if (this_rd < best_rd || x->skip)
+ {
+ // Note index of best mode
+ best_mode_index = mode_index;
+
+ *returnrate = rate2;
+ *returndistortion = distortion2;
+ best_rd = this_rd;
+ vpx_memcpy(&best_mbmode, &x->e_mbd.mbmi, sizeof(MB_MODE_INFO));
+
+ if (this_mode == B_PRED || this_mode == SPLITMV)
+ for (i = 0; i < 16; i++)
+ {
+ vpx_memcpy(&best_bmodes[i], &x->e_mbd.block[i].bmi, sizeof(B_MODE_INFO));
+ }
+ else
+ {
+ best_bmodes[0].mv = x->e_mbd.block[0].bmi.mv;
+ }
+
+ // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
+ cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+ }
+
+ // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
+ else
+ {
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+ }
+
+ if (x->skip)
+ break;
+ }
+
+ // Reduce the activation RD thresholds for the best choice mode
+ if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
+ {
+ int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3);
+
+ cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+ cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+ }
+
+ // Keep a record of best mode index for use in next loop
+ cpi->last_best_mode_index = best_mode_index;
+
+ if (best_mbmode.mode <= B_PRED)
+ {
+ x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+ vp8_pick_intra_mbuv_mode(x);
+ best_mbmode.uv_mode = x->e_mbd.mbmi.uv_mode;
+ }
+
+
+ {
+ int this_rdbin = (*returndistortion >> 7);
+
+ if (this_rdbin >= 1024)
+ {
+ this_rdbin = 1023;
+ }
+
+ cpi->error_bins[this_rdbin] ++;
+ }
+
+
+ if (cpi->is_src_frame_alt_ref && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
+ {
+ best_mbmode.mode = ZEROMV;
+ best_mbmode.ref_frame = ALTREF_FRAME;
+ best_mbmode.mv.as_int = 0;
+ best_mbmode.uv_mode = 0;
+ best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+ best_mbmode.partitioning = 0;
+ best_mbmode.dc_diff = 0;
+
+ vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+
+ for (i = 0; i < 16; i++)
+ {
+ vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO));
+ }
+
+ x->e_mbd.mbmi.mv.as_int = 0;
+
+ return best_rd;
+ }
+
+
+ // macroblock modes
+ vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+
+ if (x->e_mbd.mbmi.mode == B_PRED || x->e_mbd.mbmi.mode == SPLITMV)
+ for (i = 0; i < 16; i++)
+ {
+ vpx_memcpy(&x->e_mbd.block[i].bmi, &best_bmodes[i], sizeof(B_MODE_INFO));
+
+ }
+ else
+ {
+ vp8_set_mbmode_and_mvs(x, x->e_mbd.mbmi.mode, &best_bmodes[0].mv.as_mv);
+ }
+
+ x->e_mbd.mbmi.mv.as_mv = x->e_mbd.block[15].bmi.mv.as_mv;
+
+ return best_rd;
+}
diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h
new file mode 100644
index 000000000..fb28837ed
--- /dev/null
+++ b/vp8/encoder/pickinter.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PICKINTER_H
+#define __INC_PICKINTER_H
+#include "vpx_ports/config.h"
+#include "onyxc_int.h"
+
+#define RD_ESTIMATE(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
+extern int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *, MACROBLOCK *mb, int *Rate, int *Distortion);
+extern int vp8_pick_intra_mbuv_mode(MACROBLOCK *mb);
+extern int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+#endif
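RD_ESTIMATE above folds a rate term, scaled by the rate multiplier RM with 1/256 precision and rounding, into a distortion term weighted by DM. A standalone sketch restating the macro, with illustrative numbers that are not encoder defaults:

    #define RD_ESTIMATE(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )

    int example_rd_cost(void)
    {
        int rm = 128, dm = 1;        /* passed as x->rdmult and x->rddiv in pickinter.c */
        int rate = 100, dist = 500;  /* mode cost in bits and prediction error (SSE) */
        /* ((128 + 100 * 128) >> 8) + 1 * 500 = 50 + 500 = 550 */
        return RD_ESTIMATE(rm, dm, rate, dist);
    }

Since rdmult scales only the rate term, a larger rdmult penalises expensive-to-code modes more heavily relative to their distortion.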
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
new file mode 100644
index 000000000..bbd7840b8
--- /dev/null
+++ b/vp8/encoder/picklpf.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "alloccommon.h"
+
+extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val);
+extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val, int sharpness_lvl);
+extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
+#if HAVE_ARMV7
+extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void
+(*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc,
+ int Fraction);
+void
+vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction)
+{
+ unsigned char *src_y, *dst_y;
+ int yheight;
+ int ystride;
+ int border;
+ int yoffset;
+ int linestocopy;
+
+ border = src_ybc->border;
+ yheight = src_ybc->y_height;
+ ystride = src_ybc->y_stride;
+
+ linestocopy = (yheight >> (Fraction + 4));
+
+ if (linestocopy < 1)
+ linestocopy = 1;
+
+ linestocopy <<= 4;
+
+ yoffset = ystride * ((yheight >> 5) * 16 - 8);
+ src_y = src_ybc->y_buffer + yoffset;
+ dst_y = dst_ybc->y_buffer + yoffset;
+
+ vpx_memcpy(dst_y, src_y, ystride *(linestocopy + 16));
+}
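The Fraction argument controls how much of the frame is copied: linestocopy is yheight >> (Fraction + 4), clipped to at least 1 and then multiplied by 16, and the copy starts 8 rows above the macroblock-aligned vertical midpoint. With an illustrative 720-row frame and Fraction = 3, linestocopy = (720 >> 7) << 4 = 80 rows, and yoffset = ystride * ((720 >> 5) * 16 - 8) = ystride * 344.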
+
+static int vp8_calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, int Fraction, const vp8_variance_rtcd_vtable_t *rtcd)
+{
+ int i, j;
+ int Total = 0;
+ int srcoffset, dstoffset;
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+
+ int linestocopy = (source->y_height >> (Fraction + 4));
+ (void)rtcd;
+
+ if (linestocopy < 1)
+ linestocopy = 1;
+
+ linestocopy <<= 4;
+
+
+ srcoffset = source->y_stride * (dest->y_height >> 5) * 16;
+ dstoffset = dest->y_stride * (dest->y_height >> 5) * 16;
+
+ src += srcoffset;
+ dst += dstoffset;
+
+    // Loop through the Y plane raw and reconstruction data, summing the squared differences
+ for (i = 0; i < linestocopy; i += 16)
+ {
+ for (j = 0; j < source->y_width; j += 16)
+ {
+ unsigned int sse;
+ Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+ }
+
+ src += 16 * source->y_stride;
+ dst += 16 * dest->y_stride;
+ }
+
+ return Total;
+}
+
+extern void vp8_loop_filter_partial_frame
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl,
+ int sharpness_lvl,
+ int Fraction
+);
+
+// Enforce a minimum filter level based upon baseline Q
+static int get_min_filter_level(VP8_COMP *cpi, int base_qindex)
+{
+ int min_filter_level;
+
+ if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
+ min_filter_level = 0;
+ else
+ {
+ if (base_qindex <= 6)
+ min_filter_level = 0;
+ else if (base_qindex <= 16)
+ min_filter_level = 1;
+ else
+ min_filter_level = (base_qindex / 8);
+ }
+
+ return min_filter_level;
+}
+
+// Enforce a maximum filter level based upon baseline Q
+static int get_max_filter_level(VP8_COMP *cpi, int base_qindex)
+{
+ // PGW August 2006: Highest filter values almost always a bad idea
+
+    // jbb chg: 20100118 - not so any more with the overquant changes; allow high values
+    // when there is a lot of intra coming in.
+    int max_filter_level = MAX_LOOP_FILTER; // * 3 / 4
+
+ if (cpi->section_intra_rating > 8)
+ max_filter_level = MAX_LOOP_FILTER * 3 / 4;
+
+ (void) cpi;
+ (void) base_qindex;
+
+ return max_filter_level;
+}
+
+void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int best_err = 0;
+ int filt_err = 0;
+ int min_filter_level = 0;
+ int max_filter_level = MAX_LOOP_FILTER * 3 / 4; // PGW August 2006: Highest filter values almost always a bad idea
+ int filt_val;
+ int best_filt_val = cm->filter_level;
+
+ // Make a copy of the unfiltered / processed recon buffer
+ //vp8_yv12_copy_frame_ptr( cm->frame_to_show, &cpi->last_frame_uf );
+ vp8_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3);
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->sharpness_level = 0;
+ else
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ // Enforce a minimum filter level based upon Q
+ min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+ max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+
+ // Start the search at the previous frame filter level unless it is now out of range.
+ if (cm->filter_level < min_filter_level)
+ cm->filter_level = min_filter_level;
+ else if (cm->filter_level > max_filter_level)
+ cm->filter_level = max_filter_level;
+
+ filt_val = cm->filter_level;
+ best_filt_val = filt_val;
+
+ // Set up alternate filter values
+
+ // Get the err using the previous frame's filter value.
+ vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val, 0 , 3);
+ cm->last_frame_type = cm->frame_type;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ best_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance));
+
+ // Re-instate the unfiltered frame
+ vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
+ filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+
+ // Search lower filter levels
+ while (filt_val >= min_filter_level)
+ {
+ // Apply the loop filter
+ vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val, 0, 3);
+ cm->last_frame_type = cm->frame_type;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ // Get the err for filtered frame
+ filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance));
+
+
+ // Re-instate the unfiltered frame
+ vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
+
+ // Update the best case record or exit loop.
+ if (filt_err < best_err)
+ {
+ best_err = filt_err;
+ best_filt_val = filt_val;
+ }
+ else
+ break;
+
+ // Adjust filter level
+ filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+ }
+
+ // Search up (note that we have already done filt_val = cm->filter_level)
+ filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
+
+ if (best_filt_val == cm->filter_level)
+ {
+ // Resist raising filter level for very small gains
+ best_err -= (best_err >> 10);
+
+ while (filt_val < max_filter_level)
+ {
+ // Apply the loop filter
+ vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val, 0, 3);
+ cm->last_frame_type = cm->frame_type;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ // Get the err for filtered frame
+ filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance));
+
+ // Re-instate the unfiltered frame
+ vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
+ // Update the best case record or exit loop.
+ if (filt_err < best_err)
+ {
+                // Do not raise filter level if improvement is < 1 part in 1024
+ best_err = filt_err - (filt_err >> 10);
+
+ best_filt_val = filt_val;
+ }
+ else
+ break;
+
+ // Adjust filter level
+ filt_val += (1 + ((filt_val > 10) ? 1 : 0));
+ }
+ }
+
+ cm->filter_level = best_filt_val;
+
+ if (cm->filter_level < min_filter_level)
+ cm->filter_level = min_filter_level;
+
+ if (cm->filter_level > max_filter_level)
+ cm->filter_level = max_filter_level;
+}
+
+// Stub function for now; Alt LF not used
+void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val)
+{
+ MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ (void) filt_val;
+
+ mbd->segment_feature_data[MB_LVL_ALT_LF][0] = cpi->segment_feature_data[MB_LVL_ALT_LF][0];
+ mbd->segment_feature_data[MB_LVL_ALT_LF][1] = cpi->segment_feature_data[MB_LVL_ALT_LF][1];
+ mbd->segment_feature_data[MB_LVL_ALT_LF][2] = cpi->segment_feature_data[MB_LVL_ALT_LF][2];
+ mbd->segment_feature_data[MB_LVL_ALT_LF][3] = cpi->segment_feature_data[MB_LVL_ALT_LF][3];
+}
+
+void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int best_err = 0;
+ int filt_err = 0;
+ int min_filter_level;
+ int max_filter_level;
+ int prediction_difference = (int)(100 * abs((int)(cpi->last_auto_filter_prediction_error - cpi->prediction_error)) / (1 + cpi->prediction_error));
+
+ int filter_step;
+ int filt_high = 0;
+ int filt_mid = cm->filter_level; // Start search at previous frame filter level
+ int filt_low = 0;
+ int filt_best;
+ int filt_direction = 0;
+
+ int Bias = 0; // Bias against raising loop filter and in favour of lowering it
+
+ // Make a copy of the unfiltered / processed recon buffer
+#if HAVE_ARMV7
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
+#else
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+#endif
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->sharpness_level = 0;
+ else
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ // Enforce a minimum filter level based upon Q
+ min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+ max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+
+ // Start the search at the previous frame filter level unless it is now out of range.
+ filt_mid = cm->filter_level;
+
+ if (filt_mid < min_filter_level)
+ filt_mid = min_filter_level;
+ else if (filt_mid > max_filter_level)
+ filt_mid = max_filter_level;
+
+ // Define the initial step size
+ filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
+
+ // Get baseline error score
+ vp8cx_set_alt_lf_level(cpi, filt_mid);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid, 0);
+ cm->last_frame_type = cm->frame_type;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ best_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
+ filt_best = filt_mid;
+
+ // Re-instate the unfiltered frame
+#if HAVE_ARMV7
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+#else
+ vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#endif
+
+ while (filter_step > 0)
+ {
+ Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; //PGW change 12/12/06 for small images
+
+        // jbb chg: 20100118 - in sections with lots of new material coming in, don't bias as much toward a low filter value
+ if (cpi->section_intra_rating < 20)
+ Bias = Bias * cpi->section_intra_rating / 20;
+
+ filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
+ filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+
+ if ((filt_direction <= 0) && (filt_low != filt_mid))
+ {
+ // Get Low filter error score
+ vp8cx_set_alt_lf_level(cpi, filt_low);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low, 0);
+ cm->last_frame_type = cm->frame_type;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
+
+ // Re-instate the unfiltered frame
+#if HAVE_ARMV7
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+#else
+ vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#endif
+
+ // If value is close to the best so far then bias towards a lower loop filter value.
+ if ((filt_err - Bias) < best_err)
+ {
+ // Was it actually better than the previous best?
+ if (filt_err < best_err)
+ best_err = filt_err;
+
+ filt_best = filt_low;
+ }
+ }
+
+ // Now look at filt_high
+ if ((filt_direction >= 0) && (filt_high != filt_mid))
+ {
+ vp8cx_set_alt_lf_level(cpi, filt_high);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high, 0);
+ cm->last_frame_type = cm->frame_type;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
+
+ // Re-instate the unfiltered frame
+#if HAVE_ARMV7
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+#else
+ vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#endif
+
+ // Was it better than the previous best?
+ if (filt_err < (best_err - Bias))
+ {
+ best_err = filt_err;
+ filt_best = filt_high;
+ }
+ }
+
+        // Halve the step distance if the best filter value was the same as last time
+ if (filt_best == filt_mid)
+ {
+ filter_step = filter_step / 2;
+ filt_direction = 0;
+ }
+ else
+ {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ cm->filter_level = filt_best;
+ cpi->last_auto_filt_val = filt_best;
+ cpi->last_auto_filt_q = cm->base_qindex;
+
+ cpi->last_auto_filter_prediction_error = cpi->prediction_error;
+ cpi->frames_since_auto_filter = 0;
+}
diff --git a/vp8/encoder/ppc/csystemdependent.c b/vp8/encoder/ppc/csystemdependent.c
new file mode 100644
index 000000000..f99277f99
--- /dev/null
+++ b/vp8/encoder/ppc/csystemdependent.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "onyx_int.h"
+
+SADFunction *vp8_sad16x16;
+SADFunction *vp8_sad16x8;
+SADFunction *vp8_sad8x16;
+SADFunction *vp8_sad8x8;
+SADFunction *vp8_sad4x4;
+
+variance_function *vp8_variance4x4;
+variance_function *vp8_variance8x8;
+variance_function *vp8_variance8x16;
+variance_function *vp8_variance16x8;
+variance_function *vp8_variance16x16;
+
+variance_function *vp8_mse16x16;
+
+sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
+
+int (*vp8_block_error)(short *coeff, short *dqcoeff);
+int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
+
+int (*vp8_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp8_get_mb_ss)(short *);
+void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+void (*short_walsh4x4)(short *input, short *output, int pitch);
+
+void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+
+unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// c imports
+extern int block_error_c(short *coeff, short *dqcoeff);
+extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc);
+
+extern int vp8_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern void short_fdct4x4_c(short *input, short *output, int pitch);
+extern void short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+
+extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction sad16x16_c;
+extern SADFunction sad16x8_c;
+extern SADFunction sad8x16_c;
+extern SADFunction sad8x8_c;
+extern SADFunction sad4x4_c;
+
+extern variance_function variance16x16_c;
+extern variance_function variance8x16_c;
+extern variance_function variance16x8_c;
+extern variance_function variance8x8_c;
+extern variance_function variance4x4_c;
+extern variance_function mse16x16_c;
+
+extern sub_pixel_variance_function sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function sub_pixel_variance16x16_c;
+
+extern unsigned int vp8_get_mb_ss_c(short *);
+extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// ppc
+extern int vp8_block_error_ppc(short *coeff, short *dqcoeff);
+
+extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch);
+
+extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+
+extern SADFunction vp8_sad16x16_ppc;
+extern SADFunction vp8_sad16x8_ppc;
+extern SADFunction vp8_sad8x16_ppc;
+extern SADFunction vp8_sad8x8_ppc;
+extern SADFunction vp8_sad4x4_ppc;
+
+extern variance_function vp8_variance16x16_ppc;
+extern variance_function vp8_variance8x16_ppc;
+extern variance_function vp8_variance16x8_ppc;
+extern variance_function vp8_variance8x8_ppc;
+extern variance_function vp8_variance4x4_ppc;
+extern variance_function vp8_mse16x16_ppc;
+
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc;
+
+extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+
+void vp8_cmachine_specific_config(void)
+{
+ // Pure C:
+ vp8_mbuverror = vp8_mbuverror_c;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_ppc;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_ppc;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_ppc;
+ vp8_fast_fdct8x4 = vp8_short_fdct8x4_ppc;
+ short_walsh4x4 = vp8_short_walsh4x4_c;
+
+ vp8_variance4x4 = vp8_variance4x4_ppc;
+ vp8_variance8x8 = vp8_variance8x8_ppc;
+ vp8_variance8x16 = vp8_variance8x16_ppc;
+ vp8_variance16x8 = vp8_variance16x8_ppc;
+ vp8_variance16x16 = vp8_variance16x16_ppc;
+ vp8_mse16x16 = vp8_mse16x16_ppc;
+
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_ppc;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_ppc;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_ppc;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ppc;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc;
+
+ vp8_get_mb_ss = vp8_get_mb_ss_c;
+ vp8_get16x16pred_error = vp8_get16x16pred_error_c;
+ vp8_get8x8var = vp8_get8x8var_ppc;
+ vp8_get16x16var = vp8_get16x16var_ppc;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
+
+ vp8_sad16x16 = vp8_sad16x16_ppc;
+ vp8_sad16x8 = vp8_sad16x8_ppc;
+ vp8_sad8x16 = vp8_sad8x16_ppc;
+ vp8_sad8x8 = vp8_sad8x8_ppc;
+ vp8_sad4x4 = vp8_sad4x4_ppc;
+
+ vp8_block_error = vp8_block_error_ppc;
+ vp8_mbblock_error = vp8_mbblock_error_c;
+
+ vp8_subtract_b = vp8_subtract_b_c;
+ vp8_subtract_mby = vp8_subtract_mby_ppc;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_ppc;
+}
diff --git a/vp8/encoder/ppc/encodemb_altivec.asm b/vp8/encoder/ppc/encodemb_altivec.asm
new file mode 100644
index 000000000..e0e976d71
--- /dev/null
+++ b/vp8/encoder/ppc/encodemb_altivec.asm
@@ -0,0 +1,152 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl vp8_subtract_mbuv_ppc
+ .globl vp8_subtract_mby_ppc
+
+;# r3 short *diff
+;# r4 unsigned char *usrc
+;# r5 unsigned char *vsrc
+;# r6 unsigned char *pred
+;# r7 int stride
+vp8_subtract_mbuv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf000
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r9, 256
+ add r3, r3, r9
+ add r3, r3, r9
+ add r6, r6, r9
+
+ li r10, 16
+ li r9, 4
+ mtctr r9
+
+ vspltisw v0, 0
+
+mbu_loop:
+ lvsl v5, 0, r4 ;# permutate value for alignment
+ lvx v1, 0, r4 ;# src
+ lvx v2, 0, r6 ;# pred
+
+ add r4, r4, r7
+ addi r6, r6, 16
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ lvsl v5, 0, r4 ;# permutate value for alignment
+ lvx v1, 0, r4 ;# src
+
+ add r4, r4, r7
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+    vmrglb  v4, v0, v2          ;# unpack low pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mbu_loop
+
+ mtctr r9
+
+mbv_loop:
+ lvsl v5, 0, r5 ;# permutate value for alignment
+ lvx v1, 0, r5 ;# src
+ lvx v2, 0, r6 ;# pred
+
+ add r5, r5, r7
+ addi r6, r6, 16
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ lvsl v5, 0, r5 ;# permutate value for alignment
+ lvx v1, 0, r5 ;# src
+
+ add r5, r5, r7
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+    vmrglb  v4, v0, v2          ;# unpack low pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mbv_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# r3 short *diff
+;# r4 unsigned char *src
+;# r5 unsigned char *pred
+;# r6 int stride
+vp8_subtract_mby_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf800
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r10, 16
+ mtctr r10
+
+ vspltisw v0, 0
+
+mby_loop:
+ lvx v1, 0, r4 ;# src
+ lvx v2, 0, r5 ;# pred
+
+ add r4, r4, r6
+ addi r5, r5, 16
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vmrglb v3, v0, v1 ;# unpack low src to short
+ vmrglb v4, v0, v2 ;# unpack low pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mby_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
diff --git a/vp8/encoder/ppc/fdct_altivec.asm b/vp8/encoder/ppc/fdct_altivec.asm
new file mode 100644
index 000000000..eaab14c79
--- /dev/null
+++ b/vp8/encoder/ppc/fdct_altivec.asm
@@ -0,0 +1,204 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl vp8_short_fdct4x4_ppc
+ .globl vp8_short_fdct8x4_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# Forward and inverse DCTs are nearly identical; only differences are
+;# in normalization (fwd is twice unitary, inv is half unitary)
+;# and that they are of course transposes of each other.
+;#
+;# The following three accomplish most of the implementation and
+;# are used only by ppc_idct.c and ppc_fdct.c.
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfffc
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ li r6, 16
+
+ load_c v0, dct_tab, 0, r9, r10
+ lvx v1, r6, r10
+ addi r10, r10, 32
+ lvx v2, 0, r10
+ lvx v3, r6, r10
+
+ load_c v4, ppc_dctperm_tab, 0, r9, r10
+ load_c v5, ppc_dctperm_tab, r6, r9, r10
+
+ load_c v6, round_tab, 0, r10, r9
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
+;# a/A are the even rows 0,2 b/B are the odd rows 1,3
+;# For fwd transform, indices are horizontal positions, then frequencies.
+;# For inverse transform, frequencies then positions.
+;# The two resulting A0..A3 B0..B3 are later combined
+;# and vertically transformed.
+
+.macro two_rows_horiz Dst
+ vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
+
+ vmsumshm v10, v0, v8, v6
+ vmsumshm v10, v1, v9, v10
+ vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
+
+ vmsumshm v11, v2, v8, v6
+ vmsumshm v11, v3, v9, v11
+ vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
+
+ vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
+ vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
+.endm
+
+;# Vertical xf on two rows. DCT values in comments are for inverse transform;
+;# forward transform uses transpose.
+
+.macro two_rows_vert Ceven, Codd
+ vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
+ vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
+ vmsumshm v8, v8, v12, v6
+ vmsumshm v8, v9, v13, v8
+ vsraw v10, v8, v7
+
+ vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
+ vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
+ vmsumshm v8, v8, v12, v6
+ vmsumshm v8, v9, v13, v8
+ vsraw v8, v8, v7
+
+ vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
+.endm
+
+.macro two_rows_h Dest
+ stw r0, 0(r8)
+ lwz r0, 4(r3)
+ stw r0, 4(r8)
+ lwzux r0, r3,r5
+ stw r0, 8(r8)
+ lwz r0, 4(r3)
+ stw r0, 12(r8)
+ lvx v8, 0,r8
+ two_rows_horiz \Dest
+.endm
+
+ .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct4x4_ppc:
+
+ prologue
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct8x4_ppc:
+ prologue
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+ addi r10, r3, 0
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ ;# Next block
+ addi r3, r10, 8
+ addi r4, r4, 32
+ lvx v6, 0, r9 ;# v6 = Hround
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ epilogue
+
+ blr
+
+ .data
+ .align 4
+ppc_dctperm_tab:
+ .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
+ .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
+
+ .align 4
+dct_tab:
+ .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
+ .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
+
+ .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
+ .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
+
+ .align 4
+round_tab:
+ .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
+ .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
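
The round_tab entries above implement round-to-nearest for the two shift amounts used by the passes: the horizontal pass adds Hround = 1 << 13 before shifting right by 14, and the vertical pass adds Vround = 1 << 15 before shifting right by 16. A minimal scalar sketch of that rounding rule, with an illustrative name:

    /* Round-to-nearest right shift: add half the divisor before shifting. */
    static int round_shift(int x, int shift)
    {
        return (x + (1 << (shift - 1))) >> shift;
    }
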
diff --git a/vp8/encoder/ppc/rdopt_altivec.asm b/vp8/encoder/ppc/rdopt_altivec.asm
new file mode 100644
index 000000000..917bfe036
--- /dev/null
+++ b/vp8/encoder/ppc/rdopt_altivec.asm
@@ -0,0 +1,50 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl vp8_block_error_ppc
+
+ .align 2
+;# r3 short *Coeff
+;# r4 short *dqcoeff
+vp8_block_error_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf800
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+    stw     r5, 12(r1)          ;# transfer dc to vector register
+
+ lvx v0, 0, r3 ;# Coeff
+ lvx v1, 0, r4 ;# dqcoeff
+
+ li r10, 16
+
+ vspltisw v3, 0
+
+ vsubshs v0, v0, v1
+
+ vmsumshm v2, v0, v0, v3 ;# multiply differences
+
+ lvx v0, r10, r3 ;# Coeff
+ lvx v1, r10, r4 ;# dqcoeff
+
+ vsubshs v0, v0, v1
+
+ vmsumshm v1, v0, v0, v2 ;# multiply differences
+ vsumsws v1, v1, v3 ;# sum up
+
+ stvx v1, 0, r1
+ lwz r3, 12(r1) ;# return value
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
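
vp8_block_error_ppc above returns the sum of squared differences between the 16 coefficients and their dequantized counterparts, accumulated with two vector multiply-sum steps and a final horizontal sum. A scalar C sketch of the same quantity (the function name is illustrative):

    static int block_error_scalar(const short *coeff, const short *dqcoeff)
    {
        int i, error = 0;

        for (i = 0; i < 16; i++)
        {
            int d = coeff[i] - dqcoeff[i];   /* per-coefficient difference */
            error += d * d;                  /* accumulate squared error */
        }

        return error;
    }
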
diff --git a/vp8/encoder/ppc/sad_altivec.asm b/vp8/encoder/ppc/sad_altivec.asm
new file mode 100644
index 000000000..1102ccf17
--- /dev/null
+++ b/vp8/encoder/ppc/sad_altivec.asm
@@ -0,0 +1,276 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl vp8_sad16x16_ppc
+ .globl vp8_sad16x8_ppc
+ .globl vp8_sad8x16_ppc
+ .globl vp8_sad8x8_ppc
+ .globl vp8_sad4x4_ppc
+
+.macro load_aligned_16 V R O
+ lvsl v3, 0, \R ;# permutate value for alignment
+
+ lvx v1, 0, \R
+ lvx v2, \O, \R
+
+ vperm \V, v1, v2, v3
+.endm
+
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ li r10, 16 ;# load offset and loop counter
+
+ vspltisw v8, 0 ;# zero out total to start
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+.macro SAD_16
+ ;# v6 = abs (v4 - v5)
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+ vor v6, v6, v7
+
+ ;# v8 += abs (v4 - v5)
+ vsum4ubs v8, v6, v8
+.endm
+
+.macro sad_16_loop loop_label
+ lvsl v3, 0, r5 ;# only needs to be done once per block
+
+ ;# preload a line of data before getting into the loop
+ lvx v4, 0, r3
+ lvx v1, 0, r5
+ lvx v2, r10, r5
+
+ add r5, r5, r6
+ add r3, r3, r4
+
+ vperm v5, v1, v2, v3
+
+ .align 4
+\loop_label:
+ ;# compute difference on first row
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+
+ ;# load up next set of data
+ lvx v9, 0, r3
+ lvx v1, 0, r5
+ lvx v2, r10, r5
+
+ ;# perform abs() of difference
+ vor v6, v6, v7
+ add r3, r3, r4
+
+ ;# add to the running tally
+ vsum4ubs v8, v6, v8
+
+ ;# now onto the next line
+ vperm v5, v1, v2, v3
+ add r5, r5, r6
+ lvx v4, 0, r3
+
+ ;# compute difference on second row
+ vsububs v6, v9, v5
+ lvx v1, 0, r5
+ vsububs v7, v5, v9
+ lvx v2, r10, r5
+ vor v6, v6, v7
+ add r3, r3, r4
+ vsum4ubs v8, v6, v8
+ vperm v5, v1, v2, v3
+ add r5, r5, r6
+
+ bdnz \loop_label
+
+ vspltisw v7, 0
+
+ vsumsws v8, v8, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+.endm
+
+.macro sad_8_loop loop_label
+ .align 4
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v6, r3, r10
+ load_aligned_16 v7, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ vmrghb v4, v4, v6
+ vmrghb v5, v5, v7
+
+ SAD_16
+
+ bdnz \loop_label
+
+ vspltisw v7, 0
+
+ vsumsws v8, v8, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad16x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ sad_16_loop sad16x16_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad16x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ sad_16_loop sad16x8_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad8x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ sad_8_loop sad8x16_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad8x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ sad_8_loop sad8x8_loop
+
+ epilogue
+
+ blr
+
+.macro transfer_4x4 I P
+ lwz r0, 0(\I)
+ add \I, \I, \P
+
+ lwz r7, 0(\I)
+ add \I, \I, \P
+
+ lwz r8, 0(\I)
+ add \I, \I, \P
+
+ lwz r9, 0(\I)
+
+ stw r0, 0(r1)
+ stw r7, 4(r1)
+ stw r8, 8(r1)
+ stw r9, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad4x4_ppc:
+
+ prologue
+
+ transfer_4x4 r3, r4
+ lvx v4, 0, r1
+
+ transfer_4x4 r5, r6
+ lvx v5, 0, r1
+
+ vspltisw v8, 0 ;# zero out total to start
+
+ ;# v6 = abs (v4 - v5)
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+ vor v6, v6, v7
+
+ ;# v8 += abs (v4 - v5)
+ vsum4ubs v7, v6, v8
+ vsumsws v7, v7, v8
+
+ stvx v7, 0, r1
+ lwz r3, 12(r1)
+
+ epilogue
+
+ blr
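
All of the SAD routines above compute the same quantity at different block sizes; the vector code forms abs(src - ref) for unsigned bytes by taking both saturating subtractions and OR-ing them (vsububs twice, then vor) before accumulating with vsum4ubs. A scalar C sketch with illustrative names:

    #include <stdlib.h>

    static unsigned int sad_scalar(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   int width, int height)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < height; r++)
        {
            for (c = 0; c < width; c++)
                sad += abs(src[c] - ref[c]);   /* |src - ref| per pixel */

            src += src_stride;                 /* step to the next row */
            ref += ref_stride;
        }

        return sad;
    }
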
diff --git a/vp8/encoder/ppc/variance_altivec.asm b/vp8/encoder/ppc/variance_altivec.asm
new file mode 100644
index 000000000..952bf7286
--- /dev/null
+++ b/vp8/encoder/ppc/variance_altivec.asm
@@ -0,0 +1,374 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl vp8_get8x8var_ppc
+ .globl vp8_get16x16var_ppc
+ .globl vp8_mse16x16_ppc
+ .globl vp8_variance16x16_ppc
+ .globl vp8_variance16x8_ppc
+ .globl vp8_variance8x16_ppc
+ .globl vp8_variance8x8_ppc
+ .globl vp8_variance4x4_ppc
+
+.macro load_aligned_16 V R O
+ lvsl v3, 0, \R ;# permutate value for alignment
+
+ lvx v1, 0, \R
+ lvx v2, \O, \R
+
+ vperm \V, v1, v2, v3
+.endm
+
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ li r10, 16 ;# load offset and loop counter
+
+ vspltisw v7, 0 ;# zero for merging
+ vspltisw v8, 0 ;# zero out total to start
+ vspltisw v9, 0 ;# zero out total for dif^2
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+.macro compute_sum_sse
+    ;# Compute sum first.  Unpack so a signed subtract
+    ;#  can be used.  Only a halfword signed subtract
+    ;#  is available.  Do high, then low.
+ vmrghb v2, v7, v4
+ vmrghb v3, v7, v5
+ vsubshs v2, v2, v3
+ vsum4shs v8, v2, v8
+
+ vmrglb v2, v7, v4
+ vmrglb v3, v7, v5
+ vsubshs v2, v2, v3
+ vsum4shs v8, v2, v8
+
+ ;# Now compute sse.
+ vsububs v2, v4, v5
+ vsububs v3, v5, v4
+ vor v2, v2, v3
+
+ vmsumubm v9, v2, v2, v9
+.endm
+
+.macro variance_16 DS loop_label store_sum
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ compute_sum_sse
+
+ bdnz \loop_label
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+.if \store_sum
+ stw r3, 0(r8) ;# sum
+.endif
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srawi r3, r3, \DS ;# (sum*sum) >> DS
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro variance_8 DS loop_label store_sum
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v6, r3, r10
+ load_aligned_16 v0, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ vmrghb v4, v4, v6
+ vmrghb v5, v5, v0
+
+ compute_sum_sse
+
+ bdnz \loop_label
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+.if \store_sum
+ stw r3, 0(r8) ;# sum
+.endif
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get8x8var_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ variance_8 6, get8x8var_loop, 1
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get16x16var_ppc:
+
+ prologue
+
+ mtctr r10
+
+ variance_16 8, get16x16var_loop, 1
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_mse16x16_ppc:
+ prologue
+
+ mtctr r10
+
+mse16x16_loop:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# Now compute sse.
+ vsububs v2, v4, v5
+ vsububs v3, v5, v4
+ vor v2, v2, v3
+
+ vmsumubm v9, v2, v2, v9
+
+ bdnz mse16x16_loop
+
+ vsumsws v9, v9, v7
+
+ stvx v9, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r3, 12(r1)
+
+ stw r3, 0(r7) ;# sse
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance16x16_ppc:
+
+ prologue
+
+ mtctr r10
+
+ variance_16 8, variance16x16_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance16x8_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ variance_16 7, variance16x8_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance8x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ variance_8 7, variance8x16_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance8x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ variance_8 6, variance8x8_loop, 0
+
+ epilogue
+
+ blr
+
+.macro transfer_4x4 I P
+ lwz r0, 0(\I)
+ add \I, \I, \P
+
+ lwz r10,0(\I)
+ add \I, \I, \P
+
+ lwz r8, 0(\I)
+ add \I, \I, \P
+
+ lwz r9, 0(\I)
+
+ stw r0, 0(r1)
+ stw r10, 4(r1)
+ stw r8, 8(r1)
+ stw r9, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance4x4_ppc:
+
+ prologue
+
+ transfer_4x4 r3, r4
+ lvx v4, 0, r1
+
+ transfer_4x4 r5, r6
+ lvx v5, 0, r1
+
+ compute_sum_sse
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srawi r3, r3, 4 ;# (sum*sum) >> 4
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
+
+ epilogue
+
+ blr
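
The variance routines above all reduce to the same arithmetic: accumulate the signed sum of differences and the sum of squared differences, then return sse - ((sum*sum) >> DS), where DS is log2(width * height) of the block (4 for 4x4, 6 for 8x8, 7 for 8x16 and 16x8, 8 for 16x16). A scalar C sketch with illustrative names:

    static unsigned int variance_scalar(const unsigned char *src, int src_stride,
                                        const unsigned char *ref, int ref_stride,
                                        int width, int height, int shift,
                                        unsigned int *sse)
    {
        int sum = 0;
        unsigned int sq = 0;
        int r, c;

        for (r = 0; r < height; r++)
        {
            for (c = 0; c < width; c++)
            {
                int d = src[c] - ref[c];
                sum += d;                      /* signed sum of differences */
                sq  += d * d;                  /* sum of squared differences */
            }

            src += src_stride;
            ref += ref_stride;
        }

        *sse = sq;                             /* shift == log2(width * height) */
        return sq - (unsigned int)(((long long)sum * sum) >> shift);
    }
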
diff --git a/vp8/encoder/ppc/variance_subpixel_altivec.asm b/vp8/encoder/ppc/variance_subpixel_altivec.asm
new file mode 100644
index 000000000..148a8d25b
--- /dev/null
+++ b/vp8/encoder/ppc/variance_subpixel_altivec.asm
@@ -0,0 +1,864 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ .globl vp8_sub_pixel_variance4x4_ppc
+ .globl vp8_sub_pixel_variance8x8_ppc
+ .globl vp8_sub_pixel_variance8x16_ppc
+ .globl vp8_sub_pixel_variance16x8_ppc
+ .globl vp8_sub_pixel_variance16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+ load_c \V0, vfilter_b, r6, r12, r10
+
+ addi r6, r6, 16
+ lvx \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+ ;# load up horizontal filter
+ slwi. r5, r5, 4 ;# index into horizontal filter array
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq \jump_label
+
+ load_c v20, hfilter_b, r5, r12, r0
+
+ ;# setup constants
+    ;# v28 permutation value for ordering the filtered output
+ load_c v28, b_hperm_b, 0, r12, r0
+
+ ;# index to the next set of vectors in the row.
+ li r12, 32
+
+ ;# rounding added in on the multiply
+ vspltisw v21, 8
+ vspltisw v18, 3
+ vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
+
+ slwi. r6, r6, 5 ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+
+.macro hfilter_8 V, hp, lp, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 9 bytes wide, output is 8 bytes.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+
+    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
+    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A
+
+ vmsummbm v24, v20, v24, v18
+ vmsummbm v25, v20, v25, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+
+ vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
+.endm
+
+.macro vfilter_16 P0 P1
+ vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
+ vadduhm v22, v18, v22
+ vmuloub v23, \P0, v20
+ vadduhm v23, v18, v23
+
+ vmuleub v24, \P1, v21
+ vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
+ vmuloub v25, \P1, v21
+ vadduhm v23, v23, v25 ;# Ro = odds
+
+ vsrh v22, v22, v19 ;# divide by 128
+ vsrh v23, v23, v19 ;# v16 v17 = evens, odds
+ vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
+ vmrglh v23, v22, v23
+ vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
+.endm
+
+.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
+    ;# Compute sum first.  Unpack so a signed subtract
+    ;#  can be used.  Only a halfword signed subtract
+    ;#  is available.  Do high, then low.
+ vmrghb \t1, \z0, \src
+ vmrghb \t2, \z0, \ref
+ vsubshs \t1, \t1, \t2
+ vsum4shs \sum, \t1, \sum
+
+ vmrglb \t1, \z0, \src
+ vmrglb \t2, \z0, \ref
+ vsubshs \t1, \t1, \t2
+ vsum4shs \sum, \t1, \sum
+
+ ;# Now compute sse.
+ vsububs \t1, \src, \ref
+ vsububs \t2, \ref, \src
+ vor \t1, \t1, \t2
+
+ vmsumubm \sse, \t1, \t1, \sse
+.endm
+
+.macro variance_final sum, sse, z0, DS
+ vsumsws \sum, \sum, \z0
+ vsumsws \sse, \sse, \z0
+
+ stvx \sum, 0, r1
+ lwz r3, 12(r1)
+
+ stvx \sse, 0, r1
+ lwz r4, 12(r1)
+
+ stw r4, 0(r9) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro compute_sum_sse_16 V, increment_counter
+ load_and_align_16 v16, r7, r8, \increment_counter
+ compute_sum_sse \V, v16, v18, v19, v20, v21, v23
+.endm
+
+.macro load_and_align_16 V, R, P, increment_counter
+ lvsl v17, 0, \R ;# permutate value for alignment
+
+    ;# input is 16 bytes wide and is not guaranteed to be aligned,
+    ;#  so it can span two vectors.
+ lvx v21, 0, \R
+ lvx v22, r10, \R
+
+.if \increment_counter
+ add \R, \R, \P
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_4x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r12, r0
+ load_c v11, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v10, v11, 1
+ hfilter_8 v1, v10, v11, 1
+ hfilter_8 v2, v10, v11, 1
+ hfilter_8 v3, v10, v11, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_4x4_b
+
+ hfilter_8 v4, v10, v11, 0
+
+ b second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 0
+
+second_pass_4x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+compute_sum_sse_4x4_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ load_and_align_16 v4, r7, r8, 1
+ load_and_align_16 v5, r7, r8, 1
+ load_and_align_16 v6, r7, r8, 1
+ load_and_align_16 v7, r7, r8, 1
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+
+ load_c v10, b_hilo_b, 0, r12, r0
+
+ vperm v0, v0, v1, v10
+ vperm v1, v2, v3, v10
+
+ compute_sum_sse v0, v1, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 4
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff0
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x8_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r12, r0
+ load_c v11, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v10, v11, 1
+ hfilter_8 v1, v10, v11, 1
+ hfilter_8 v2, v10, v11, 1
+ hfilter_8 v3, v10, v11, 1
+ hfilter_8 v4, v10, v11, 1
+ hfilter_8 v5, v10, v11, 1
+ hfilter_8 v6, v10, v11, 1
+ hfilter_8 v7, v10, v11, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_8x8_b
+
+ hfilter_8 v8, v10, v11, 0
+
+ b second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 0
+
+ beq compute_sum_sse_8x8_b
+
+second_pass_8x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+compute_sum_sse_8x8_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+
+ load_and_align_16 v4, r7, r8, 1
+ load_and_align_16 v5, r7, r8, 1
+ load_and_align_16 v6, r7, r8, 1
+ load_and_align_16 v7, r7, r8, 1
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 0
+
+ vmrghb v4, v4, v5
+ vmrghb v5, v6, v7
+ vmrghb v6, v8, v9
+ vmrghb v7, v10, v11
+
+ compute_sum_sse v0, v4, v18, v19, v20, v21, v23
+ compute_sum_sse v1, v5, v18, v19, v20, v21, v23
+ compute_sum_sse v2, v6, v18, v19, v20, v21, v23
+ compute_sum_sse v3, v7, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 6
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance8x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfffc
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x16_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v29, b_0123_b, 0, r12, r0
+ load_c v30, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v29, v30, 1
+ hfilter_8 v1, v29, v30, 1
+ hfilter_8 v2, v29, v30, 1
+ hfilter_8 v3, v29, v30, 1
+ hfilter_8 v4, v29, v30, 1
+ hfilter_8 v5, v29, v30, 1
+ hfilter_8 v6, v29, v30, 1
+ hfilter_8 v7, v29, v30, 1
+ hfilter_8 v8, v29, v30, 1
+ hfilter_8 v9, v29, v30, 1
+ hfilter_8 v10, v29, v30, 1
+ hfilter_8 v11, v29, v30, 1
+ hfilter_8 v12, v29, v30, 1
+ hfilter_8 v13, v29, v30, 1
+ hfilter_8 v14, v29, v30, 1
+ hfilter_8 v15, v29, v30, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_8x16_b
+
+ hfilter_8 v16, v29, v30, 0
+
+ b second_pass_8x16_b
+
+second_pass_8x16_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+ load_and_align_16 v9, r3, r4, 1
+ load_and_align_16 v10, r3, r4, 1
+ load_and_align_16 v11, r3, r4, 1
+ load_and_align_16 v12, r3, r4, 1
+ load_and_align_16 v13, r3, r4, 1
+ load_and_align_16 v14, r3, r4, 1
+ load_and_align_16 v15, r3, r4, 1
+ load_and_align_16 v16, r3, r4, 0
+
+ beq compute_sum_sse_8x16_b
+
+second_pass_8x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+compute_sum_sse_8x16_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+ vmrghb v4, v8, v9
+ vmrghb v5, v10, v11
+ vmrghb v6, v12, v13
+ vmrghb v7, v14, v15
+
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 1
+ load_and_align_16 v12, r7, r8, 1
+ load_and_align_16 v13, r7, r8, 1
+ load_and_align_16 v14, r7, r8, 1
+ load_and_align_16 v15, r7, r8, 1
+
+ vmrghb v8, v8, v9
+ vmrghb v9, v10, v11
+ vmrghb v10, v12, v13
+ vmrghb v11, v14, v15
+
+ compute_sum_sse v0, v8, v18, v19, v20, v21, v23
+ compute_sum_sse v1, v9, v18, v19, v20, v21, v23
+ compute_sum_sse v2, v10, v18, v19, v20, v21, v23
+ compute_sum_sse v3, v11, v18, v19, v20, v21, v23
+
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 1
+ load_and_align_16 v12, r7, r8, 1
+ load_and_align_16 v13, r7, r8, 1
+ load_and_align_16 v14, r7, r8, 1
+ load_and_align_16 v15, r7, r8, 0
+
+ vmrghb v8, v8, v9
+ vmrghb v9, v10, v11
+ vmrghb v10, v12, v13
+ vmrghb v11, v14, v15
+
+ compute_sum_sse v4, v8, v18, v19, v20, v21, v23
+ compute_sum_sse v5, v9, v18, v19, v20, v21, v23
+ compute_sum_sse v6, v10, v18, v19, v20, v21, v23
+ compute_sum_sse v7, v11, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 7
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+ blr
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+    ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;#  input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+ lvx v23, r12, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+ vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
+
+ ;# set 0
+ vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+ ;# set 1
+ vsldoi v23, v21, v22, 1
+ vmsummbm v25, v20, v23, v18
+
+ ;# set 2
+ vsldoi v23, v21, v22, 2
+ vmsummbm v26, v20, v23, v18
+
+ ;# set 3
+ vsldoi v23, v21, v22, 3
+ vmsummbm v27, v20, v23, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+ vsrh v25, v25, v19
+
+ vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
+ vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance16x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ HProlog second_pass_16x8_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_16x8_b
+
+ hfilter_16 v8, 0
+
+ b second_pass_16x8_b
+
+second_pass_16x8_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+
+ beq compute_sum_sse_16x8_b
+
+second_pass_16x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+compute_sum_sse_16x8_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ compute_sum_sse_16 v0, 1
+ compute_sum_sse_16 v1, 1
+ compute_sum_sse_16 v2, 1
+ compute_sum_sse_16 v3, 1
+ compute_sum_sse_16 v4, 1
+ compute_sum_sse_16 v5, 1
+ compute_sum_sse_16 v6, 1
+ compute_sum_sse_16 v7, 0
+
+ variance_final v18, v19, v23, 7
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ HProlog second_pass_16x16_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+ hfilter_16 v8, 1
+ hfilter_16 v9, 1
+ hfilter_16 v10, 1
+ hfilter_16 v11, 1
+ hfilter_16 v12, 1
+ hfilter_16 v13, 1
+ hfilter_16 v14, 1
+ hfilter_16 v15, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_16x16_b
+
+ hfilter_16 v16, 0
+
+ b second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+ load_and_align_16 v9, r3, r4, 1
+ load_and_align_16 v10, r3, r4, 1
+ load_and_align_16 v11, r3, r4, 1
+ load_and_align_16 v12, r3, r4, 1
+ load_and_align_16 v13, r3, r4, 1
+ load_and_align_16 v14, r3, r4, 1
+ load_and_align_16 v15, r3, r4, 1
+ load_and_align_16 v16, r3, r4, 0
+
+ beq compute_sum_sse_16x16_b
+
+second_pass_16x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+compute_sum_sse_16x16_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ compute_sum_sse_16 v0, 1
+ compute_sum_sse_16 v1, 1
+ compute_sum_sse_16 v2, 1
+ compute_sum_sse_16 v3, 1
+ compute_sum_sse_16 v4, 1
+ compute_sum_sse_16 v5, 1
+ compute_sum_sse_16 v6, 1
+ compute_sum_sse_16 v7, 1
+ compute_sum_sse_16 v8, 1
+ compute_sum_sse_16 v9, 1
+ compute_sum_sse_16 v10, 1
+ compute_sum_sse_16 v11, 1
+ compute_sum_sse_16 v12, 1
+ compute_sum_sse_16 v13, 1
+ compute_sum_sse_16 v14, 1
+ compute_sum_sse_16 v15, 0
+
+ variance_final v18, v19, v23, 8
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ .align 4
+hfilter_b:
+ .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
+ .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
+ .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
+ .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
+ .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
+ .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
+ .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
+ .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
+
+ .align 4
+vfilter_b:
+ .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
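
Each pair of taps in hfilter_b (and each pair of vfilter_b vectors) sums to 128, so both passes are bilinear interpolations: the two neighbouring pixels are weighted by their taps, the rounding value of 64 set up in HProlog and the second-pass blocks is added, and the result is shifted right by 7. A scalar C sketch of one output pixel (the function name is illustrative):

    /* Two-tap bilinear filter: tap0 + tap1 == 128, rounding 64, shift 7. */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                      int tap0, int tap1)
    {
        return (unsigned char)((a * tap0 + b * tap1 + 64) >> 7);
    }
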
diff --git a/vp8/encoder/preproc.c b/vp8/encoder/preproc.c
new file mode 100644
index 000000000..d2a13dced
--- /dev/null
+++ b/vp8/encoder/preproc.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : preproc.c
+*
+* Description : Simple pre-processor.
+*
+****************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+
+#include "memory.h"
+#include "preproc7.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+* Macros
+****************************************************************************/
+#define FRAMECOUNT 7
+#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
+
+/****************************************************************************
+* Imports
+****************************************************************************/
+extern void vp8_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+/****************************************************************************
+* Exported Global Variables
+****************************************************************************/
+void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
+void temp_filter_mmx
+(
+ pre_proc_instance *ppi,
+ unsigned char *s,
+ unsigned char *d,
+ int bytes,
+ int strength
+);
+void temp_filter_wmt
+(
+ pre_proc_instance *ppi,
+ unsigned char *s,
+ unsigned char *d,
+ int bytes,
+ int strength
+);
+
+/****************************************************************************
+ *
+ * ROUTINE : temp_filter_c
+ *
+ * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ * unsigned char *s : Pointer to source frame.
+ * unsigned char *d : Pointer to destination frame.
+ * int bytes : Number of bytes to filter.
+ * int strength : Strength of filter to apply.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur.
+ *
+ * SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_c
+(
+ pre_proc_instance *ppi,
+ unsigned char *s,
+ unsigned char *d,
+ int bytes,
+ int strength
+)
+{
+ int byte = 0;
+ unsigned char *frameptr = ppi->frame_buffer;
+
+ if (ppi->frame == 0)
+ {
+ do
+ {
+ int frame = 0;
+
+ do
+ {
+ *frameptr = s[byte];
+ ++frameptr;
+ ++frame;
+ }
+ while (frame < FRAMECOUNT);
+
+ d[byte] = s[byte];
+
+ ++byte;
+ }
+ while (byte < bytes);
+ }
+ else
+ {
+ int modifier;
+ int offset = (ppi->frame % FRAMECOUNT);
+
+ do
+ {
+ int accumulator = 0;
+ int count = 0;
+ int frame = 0;
+
+ frameptr[offset] = s[byte];
+
+ do
+ {
+ int pixel_value = *frameptr;
+
+ modifier = s[byte];
+ modifier -= pixel_value;
+ modifier *= modifier;
+ modifier >>= strength;
+ modifier *= 3;
+
+ if (modifier > 16)
+ modifier = 16;
+
+ modifier = 16 - modifier;
+
+ accumulator += modifier * pixel_value;
+
+ count += modifier;
+
+ frameptr++;
+
+ ++frame;
+ }
+ while (frame < FRAMECOUNT);
+
+ accumulator += (count >> 1);
+ accumulator *= ppi->fixed_divide[count];
+ accumulator >>= 16;
+
+ d[byte] = accumulator;
+
+ ++byte;
+ }
+ while (byte < bytes);
+ }
+
+ ++ppi->frame;
+}
+/****************************************************************************
+ *
+ * ROUTINE : delete_pre_proc
+ *
+ * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Deletes a pre-processing instance.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void delete_pre_proc(pre_proc_instance *ppi)
+{
+ if (ppi->frame_buffer_alloc)
+ vpx_free(ppi->frame_buffer_alloc);
+
+ ppi->frame_buffer_alloc = 0;
+ ppi->frame_buffer = 0;
+
+ if (ppi->fixed_divide_alloc)
+ vpx_free(ppi->fixed_divide_alloc);
+
+ ppi->fixed_divide_alloc = 0;
+ ppi->fixed_divide = 0;
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : init_pre_proc
+ *
+ * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ * int frame_size : Number of bytes in one frame.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : int: 1 if successful, 0 if failed.
+ *
+ *  FUNCTION      : Initializes a pre-processor instance.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+int init_pre_proc7(pre_proc_instance *ppi, int frame_size)
+{
+ int i;
+ int mmx_enabled;
+ int xmm_enabled;
+ int wmt_enabled;
+
+ vp8_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+
+ if (wmt_enabled)
+ temp_filter = temp_filter_wmt;
+ else if (mmx_enabled)
+ temp_filter = temp_filter_mmx;
+ else
+ temp_filter = temp_filter_c;
+
+
+ delete_pre_proc(ppi);
+
+ ppi->frame_buffer_alloc = vpx_malloc(32 + frame_size * FRAMECOUNT * sizeof(unsigned char));
+
+ if (!ppi->frame_buffer_alloc)
+ {
+ delete_pre_proc(ppi);
+ return 0;
+ }
+
+ ppi->frame_buffer = (unsigned char *) ROUNDUP32(ppi->frame_buffer_alloc);
+
+ ppi->fixed_divide_alloc = vpx_malloc(32 + 255 * sizeof(unsigned int));
+
+ if (!ppi->fixed_divide_alloc)
+ {
+ delete_pre_proc(ppi);
+ return 0;
+ }
+
+ ppi->fixed_divide = (unsigned int *) ROUNDUP32(ppi->fixed_divide_alloc);
+
+ for (i = 1; i < 255; i++)
+ ppi->fixed_divide[i] = 0x10000 / i;
+
+ return 1;
+}
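
The weight that temp_filter_c applies to each history-buffer pixel depends on how close it is to the incoming source pixel: identical pixels get the full weight of 16, and the weight falls off with the squared difference scaled by the strength parameter. The final division by the accumulated weight is done with the fixed_divide reciprocal table set up in init_pre_proc7 (fixed_divide[i] == 0x10000 / i). A scalar sketch of the weight computation, with an illustrative name:

    static int temporal_weight(int src_pixel, int hist_pixel, int strength)
    {
        int modifier = src_pixel - hist_pixel;

        modifier = (modifier * modifier) >> strength;  /* scaled squared difference */
        modifier *= 3;

        if (modifier > 16)
            modifier = 16;

        return 16 - modifier;   /* 16 = full weight, 0 = pixel ignored */
    }
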
diff --git a/vp8/encoder/psnr.c b/vp8/encoder/psnr.c
new file mode 100644
index 000000000..0e34cecb1
--- /dev/null
+++ b/vp8/encoder/psnr.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "math.h"
+#include "systemdependent.h" /* for vp8_clear_system_state() */
+
+#define MAX_PSNR 60
+
+double vp8_mse2psnr(double Samples, double Peak, double Mse)
+{
+ double psnr;
+
+ if ((double)Mse > 0.0)
+ psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
+ else
+ psnr = MAX_PSNR; // Limit to prevent / 0
+
+ if (psnr > MAX_PSNR)
+ psnr = MAX_PSNR;
+
+ return psnr;
+}
+
+double vp8_calc_psnr(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *YPsnr, double *UPsnr, double *VPsnr, double *sq_error)
+{
+ int i, j;
+ int Diff;
+ double frame_psnr;
+ double Total;
+ double grand_total;
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+
+ Total = 0.0;
+ grand_total = 0.0;
+
+    // Loop through the Y plane raw and reconstruction data, summing the squared differences
+ for (i = 0; i < source->y_height; i++)
+ {
+
+ for (j = 0; j < source->y_width; j++)
+ {
+ Diff = (int)(src[j]) - (int)(dst[j]);
+ Total += Diff * Diff;
+ }
+
+ src += source->y_stride;
+ dst += dest->y_stride;
+ }
+
+ // Work out Y PSNR
+ *YPsnr = vp8_mse2psnr(source->y_height * source->y_width, 255.0, Total);
+ grand_total += Total;
+ Total = 0;
+
+
+ // Loop through the U plane
+ src = source->u_buffer;
+ dst = dest->u_buffer;
+
+ for (i = 0; i < source->uv_height; i++)
+ {
+
+ for (j = 0; j < source->uv_width; j++)
+ {
+ Diff = (int)(src[j]) - (int)(dst[j]);
+ Total += Diff * Diff;
+ }
+
+ src += source->uv_stride;
+ dst += dest->uv_stride;
+ }
+
+ // Work out U PSNR
+ *UPsnr = vp8_mse2psnr(source->uv_height * source->uv_width, 255.0, Total);
+ grand_total += Total;
+ Total = 0;
+
+
+ // V PSNR
+ src = source->v_buffer;
+ dst = dest->v_buffer;
+
+ for (i = 0; i < source->uv_height; i++)
+ {
+
+ for (j = 0; j < source->uv_width; j++)
+ {
+ Diff = (int)(src[j]) - (int)(dst[j]);
+ Total += Diff * Diff;
+ }
+
+ src += source->uv_stride;
+ dst += dest->uv_stride;
+ }
+
+    // Work out V PSNR
+ *VPsnr = vp8_mse2psnr(source->uv_height * source->uv_width, 255.0, Total);
+ grand_total += Total;
+ Total = 0;
+
+ // Work out total PSNR
+ frame_psnr = vp8_mse2psnr(source->y_height * source->y_width * 3 / 2 , 255.0, grand_total);
+
+ *sq_error = 1.0 * grand_total;
+
+ return frame_psnr;
+}
diff --git a/vp8/encoder/psnr.h b/vp8/encoder/psnr.h
new file mode 100644
index 000000000..9f6ca0bbf
--- /dev/null
+++ b/vp8/encoder/psnr.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PSNR_H
+#define __INC_PSNR_H
+
+extern double vp8_mse2psnr(double Samples, double Peak, double Mse);
+extern double vp8_calc_psnr(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *YPsnr, double *UPsnr, double *VPsnr, double *sq_error);
+
+#endif
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
new file mode 100644
index 000000000..6028ebf56
--- /dev/null
+++ b/vp8/encoder/quantize.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "quantize.h"
+#include "entropy.h"
+#include "predictdc.h"
+
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *coeff_ptr = &b->coeff[0];
+ short *zbin_ptr = &b->zbin[0][0];
+ short *round_ptr = &b->round[0][0];
+ short *quant_ptr = &b->quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = &d->dequant[0][0];
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+ zbin = zbin_ptr[rc] ;
+
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin)
+ {
+ y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
+
+ if (y)
+ {
+ eob = i; // last nonzero coeffs
+ }
+ }
+ }
+
+ d->eob = eob + 1;
+
+}
+
+void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = &b->zrun_zbin_boost[0];
+ short *coeff_ptr = &b->coeff[0];
+ short *zbin_ptr = &b->zbin[0][0];
+ short *round_ptr = &b->round[0][0];
+ short *quant_ptr = &b->quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = &d->dequant[0][0];
+ short zbin_oq_value = b->zbin_extra;
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+
+ //if ( i == 0 )
+ // zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value/2;
+ //else
+ zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+
+ zbin_boost_ptr ++;
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin)
+ {
+ y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
+
+ if (y)
+ {
+ eob = i; // last nonzero coeffs
+ zbin_boost_ptr = &b->zrun_zbin_boost[0]; // reset zero runlength
+ }
+ }
+ }
+
+ d->eob = eob + 1;
+}
+void vp8_quantize_mby(MACROBLOCK *x)
+{
+ int i;
+
+ if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
+ }
+
+ x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[24].eob);
+
+ }
+ else
+ {
+ for (i = 0; i < 16; i++)
+ {
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+ }
+ }
+}
+
+void vp8_quantize_mb(MACROBLOCK *x)
+{
+ int i;
+
+ x->e_mbd.mbmi.mb_skip_coeff = 1;
+
+ if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
+ }
+
+ for (i = 16; i < 25; i++)
+ {
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+ }
+ }
+ else
+ {
+ for (i = 0; i < 24; i++)
+ {
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+ }
+ }
+
+}
+
+
+void vp8_quantize_mbuv(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i++)
+ {
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+ }
+}
+
+// This function is not currently called
+void vp8_quantize_mbrd(MACROBLOCK *x)
+{
+ int i;
+
+ x->e_mbd.mbmi.mb_skip_coeff = 1;
+
+ if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
+ }
+
+ for (i = 16; i < 25; i++)
+ {
+ x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+ }
+ }
+ else
+ {
+ for (i = 0; i < 24; i++)
+ {
+ x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+ }
+ }
+}
+
+void vp8_quantize_mbuvrd(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i++)
+ {
+ x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+ }
+}
+
+void vp8_quantize_mbyrd(MACROBLOCK *x)
+{
+ int i;
+
+ if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
+ }
+
+ x->quantize_brd(&x->block[24], &x->e_mbd.block[24]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[24].eob);
+
+ }
+ else
+ {
+ for (i = 0; i < 16; i++)
+ {
+ x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+ x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+ }
+ }
+}
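
Both quantizers above follow the same per-coefficient recipe: coefficients whose magnitude falls inside the zero bin are dropped (the regular quantizer widens the bin with the zero-run boost and zbin_extra terms), otherwise the magnitude is scaled by a Q16 quantizer value, the sign is restored, and the dequantized value is recorded. A single-coefficient C sketch with illustrative names:

    static int quantize_coeff(int z, int zbin, int round, int quant,
                              int dequant, short *dqcoeff)
    {
        int sz = (z >> 31);        /* 0 or -1: sign of z */
        int x = (z ^ sz) - sz;     /* abs(z) */
        int y = 0;

        if (x >= zbin)
        {
            y = ((x + round) * quant) >> 16;   /* quantize the magnitude */
            y = (y ^ sz) - sz;                 /* restore the sign */
        }

        *dqcoeff = (short)(y * dequant);       /* dequantized value */
        return y;                              /* quantized coefficient */
    }
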
diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h
new file mode 100644
index 000000000..868e8e3a8
--- /dev/null
+++ b/vp8/encoder/quantize.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_QUANTIZE_H
+#define __INC_QUANTIZE_H
+
+#include "block.h"
+
+#define prototype_quantize_block(sym) \
+ void (sym)(BLOCK *b,BLOCKD *d)
+
+#if ARCH_ARM
+#include "arm/quantize_arm.h"
+#endif
+
+#ifndef vp8_quantize_quantb
+#define vp8_quantize_quantb vp8_regular_quantize_b
+#endif
+extern prototype_quantize_block(vp8_quantize_quantb);
+
+#ifndef vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_c
+#endif
+extern prototype_quantize_block(vp8_quantize_fastquantb);
+
+typedef struct
+{
+ prototype_quantize_block(*quantb);
+ prototype_quantize_block(*fastquantb);
+} vp8_quantize_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define QUANTIZE_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define QUANTIZE_INVOKE(ctx,fn) vp8_quantize_##fn
+#endif
+
+extern void vp8_quantize_mb(MACROBLOCK *x);
+extern void vp8_quantize_mbuv(MACROBLOCK *x);
+extern void vp8_quantize_mby(MACROBLOCK *x);
+extern void vp8_quantize_mbyrd(MACROBLOCK *x);
+extern void vp8_quantize_mbuvrd(MACROBLOCK *x);
+extern void vp8_quantize_mbrd(MACROBLOCK *x);
+
+#endif
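
The QUANTIZE_INVOKE macro above either dispatches through the vtable (when runtime CPU detection is enabled) or collapses to a direct call at compile time. A hypothetical call site, with an illustrative helper name and vtable pointer that are not taken from this diff:

    static void quantize_block(vp8_quantize_rtcd_vtable_t *rtcd, BLOCK *b, BLOCKD *d)
    {
        /* Expands to rtcd->fastquantb(b, d) under CONFIG_RUNTIME_CPU_DETECT,
           and to vp8_quantize_fastquantb(b, d) otherwise. */
        QUANTIZE_INVOKE(rtcd, fastquantb)(b, d);
    }
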
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
new file mode 100644
index 000000000..05040d310
--- /dev/null
+++ b/vp8/encoder/ratectrl.c
@@ -0,0 +1,1552 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "math.h"
+#include "common.h"
+#include "ratectrl.h"
+#include "entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+#include "systemdependent.h"
+#include "encodemv.h"
+
+
+#define MIN_BPB_FACTOR 0.01
+#define MAX_BPB_FACTOR 50
+
+extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
+extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];
+
+
+
+#ifdef MODE_STATS
+extern int y_modes[5];
+extern int uv_modes[4];
+extern int b_modes[10];
+
+extern int inter_y_modes[10];
+extern int inter_uv_modes[4];
+extern int inter_b_modes[10];
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS 9
+
+const int vp8_bits_per_mb[2][QINDEX_RANGE] =
+{
+ // (Updated 19 March 08) Baseline estimate of INTRA-frame Bits Per MB at each Q:
+ {
+ 674781, 606845, 553905, 524293, 500428, 452540, 435379, 414719,
+ 390970, 371082, 359416, 341807, 336957, 317263, 303724, 298402,
+ 285688, 275237, 268455, 262560, 256038, 248734, 241087, 237615,
+ 229247, 225211, 219112, 213920, 211559, 202714, 198482, 193401,
+ 187866, 183453, 179212, 175965, 171852, 167235, 163972, 160560,
+ 156032, 154349, 151390, 148725, 145708, 142311, 139981, 137700,
+ 134084, 131863, 129746, 128498, 126077, 123461, 121290, 117782,
+ 114883, 112332, 108410, 105685, 103434, 101192, 98587, 95959,
+ 94059, 92017, 89970, 87936, 86142, 84801, 82736, 81106,
+ 79668, 78135, 76641, 75103, 73943, 72693, 71401, 70098,
+ 69165, 67901, 67170, 65987, 64923, 63534, 62378, 61302,
+ 59921, 58941, 57844, 56782, 55960, 54973, 54257, 53454,
+ 52230, 50938, 49962, 49190, 48288, 47270, 46738, 46037,
+ 45020, 44027, 43216, 42287, 41594, 40702, 40081, 39414,
+ 38282, 37627, 36987, 36375, 35808, 35236, 34710, 34162,
+ 33659, 33327, 32751, 32384, 31936, 31461, 30982, 30582,
+ },
+
+ // (Updated 19 March 08) Baseline estimate of INTER-frame Bits Per MB at each Q:
+ {
+ 497401, 426316, 372064, 352732, 335763, 283921, 273848, 253321,
+ 233181, 217727, 210030, 196685, 194836, 178396, 167753, 164116,
+ 154119, 146929, 142254, 138488, 133591, 127741, 123166, 120226,
+ 114188, 111756, 107882, 104749, 102522, 96451, 94424, 90905,
+ 87286, 84931, 82111, 80534, 77610, 74700, 73037, 70715,
+ 68006, 67235, 65374, 64009, 62134, 60180, 59105, 57691,
+ 55509, 54512, 53318, 52693, 51194, 49840, 48944, 46980,
+ 45668, 44177, 42348, 40994, 39859, 38889, 37717, 36391,
+ 35482, 34622, 33795, 32756, 32002, 31492, 30573, 29737,
+ 29152, 28514, 27941, 27356, 26859, 26329, 25874, 25364,
+ 24957, 24510, 24290, 23689, 23380, 22845, 22481, 22066,
+ 21587, 21219, 20880, 20452, 20260, 19926, 19661, 19334,
+ 18915, 18391, 18046, 17833, 17441, 17105, 16888, 16729,
+ 16383, 16023, 15706, 15442, 15222, 14938, 14673, 14452,
+ 14005, 13807, 13611, 13447, 13223, 13102, 12963, 12801,
+ 12627, 12534, 12356, 12228, 12056, 11907, 11746, 11643,
+ }
+};
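
Entries in vp8_bits_per_mb are scaled by 512 (BPER_MB_NORMBITS == 9), with row 0 holding the intra-frame estimates and row 1 the inter-frame estimates. A sketch of how such an entry converts into a whole-frame bit estimate; the function and parameter names are illustrative:

    static int estimate_frame_bits(int intra_frame, int q, int num_mbs)
    {
        long long bits = (long long)num_mbs * vp8_bits_per_mb[intra_frame ? 0 : 1][q];
        return (int)(bits >> BPER_MB_NORMBITS);   /* undo the x512 scaling */
    }
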
+
+const int vp8_kf_boost_qadjustment[QINDEX_RANGE] =
+{
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 163, 164, 165, 166, 167,
+ 168, 169, 170, 171, 172, 173, 174, 175,
+ 176, 177, 178, 179, 180, 181, 182, 183,
+ 184, 185, 186, 187, 188, 189, 190, 191,
+ 192, 193, 194, 195, 196, 197, 198, 199,
+ 200, 200, 201, 201, 202, 203, 203, 203,
+ 204, 204, 205, 205, 206, 206, 207, 207,
+ 208, 208, 209, 209, 210, 210, 211, 211,
+ 212, 212, 213, 213, 214, 214, 215, 215,
+ 216, 216, 217, 217, 218, 218, 219, 219,
+ 220, 220, 220, 220, 220, 220, 220, 220,
+ 220, 220, 220, 220, 220, 220, 220, 220,
+};
+
+//#define GFQ_ADJUSTMENT (Q+100)
+#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
+const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
+{
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 163, 164, 165, 166, 167,
+ 168, 169, 170, 171, 172, 173, 174, 175,
+ 176, 177, 178, 179, 180, 181, 182, 183,
+ 184, 184, 185, 185, 186, 186, 187, 187,
+ 188, 188, 189, 189, 190, 190, 191, 191,
+ 192, 192, 193, 193, 194, 194, 194, 194,
+ 195, 195, 196, 196, 197, 197, 198, 198
+};
+
+/*
+const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
+{
+ 100,101,102,103,104,105,105,106,
+ 106,107,107,108,109,109,110,111,
+ 112,113,114,115,116,117,118,119,
+ 120,121,122,123,124,125,126,127,
+ 128,129,130,131,132,133,134,135,
+ 136,137,138,139,140,141,142,143,
+ 144,145,146,147,148,149,150,151,
+ 152,153,154,155,156,157,158,159,
+ 160,161,162,163,164,165,166,167,
+ 168,169,170,170,171,171,172,172,
+ 173,173,173,174,174,174,175,175,
+ 175,176,176,176,177,177,177,177,
+ 178,178,179,179,180,180,181,181,
+ 182,182,183,183,184,184,185,185,
+ 186,186,187,187,188,188,189,189,
+ 190,190,191,191,192,192,193,193,
+};
+*/
+
+const int vp8_kf_gf_boost_qlimits[QINDEX_RANGE] =
+{
+ 150, 155, 160, 165, 170, 175, 180, 185,
+ 190, 195, 200, 205, 210, 215, 220, 225,
+ 230, 235, 240, 245, 250, 255, 260, 265,
+ 270, 275, 280, 285, 290, 295, 300, 305,
+ 310, 320, 330, 340, 350, 360, 370, 380,
+ 390, 400, 410, 420, 430, 440, 450, 460,
+ 470, 480, 490, 500, 510, 520, 530, 540,
+ 550, 560, 570, 580, 590, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+};
+
+// % adjustment to target kf size based on separation from previous frame
+const int vp8_kf_boost_seperationt_adjustment[16] =
+{
+ 30, 40, 50, 55, 60, 65, 70, 75,
+ 80, 85, 90, 95, 100, 100, 100, 100,
+};
+
+
+const int vp8_gf_adjust_table[101] =
+{
+ 100,
+ 115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
+ 240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
+ 350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+};
+
+const int vp8_gf_intra_useage_adjustment[20] =
+{
+ 125, 120, 115, 110, 105, 100, 95, 85, 80, 75,
+ 70, 65, 60, 55, 50, 50, 50, 50, 50, 50,
+};
+
+const int vp8_gf_interval_table[101] =
+{
+ 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+};
+
+static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
+
+
+void vp8_save_coding_context(VP8_COMP *cpi)
+{
+ CODING_CONTEXT *const cc = & cpi->coding_context;
+
+ // Stores a snapshot of key state variables which can subsequently be
+ // restored with a call to vp8_restore_coding_context. These functions are
+ // intended for use in a re-code loop in vp8_compress_frame where the
+ // quantizer value is adjusted between loop iterations.
+
+ cc->frames_since_key = cpi->frames_since_key;
+ cc->filter_level = cpi->common.filter_level;
+ cc->frames_till_gf_update_due = cpi->frames_till_gf_update_due;
+ cc->frames_since_golden = cpi->common.frames_since_golden;
+
+ vp8_copy(cc->mvc, cpi->common.fc.mvc);
+ vp8_copy(cc->mvcosts, cpi->mb.mvcosts);
+
+ vp8_copy(cc->kf_ymode_prob, cpi->common.kf_ymode_prob);
+ vp8_copy(cc->ymode_prob, cpi->common.fc.ymode_prob);
+ vp8_copy(cc->kf_uv_mode_prob, cpi->common.kf_uv_mode_prob);
+ vp8_copy(cc->uv_mode_prob, cpi->common.fc.uv_mode_prob);
+
+ vp8_copy(cc->ymode_count, cpi->ymode_count);
+ vp8_copy(cc->uv_mode_count, cpi->uv_mode_count);
+
+
+ // Stats
+#ifdef MODE_STATS
+ vp8_copy(cc->y_modes, y_modes);
+ vp8_copy(cc->uv_modes, uv_modes);
+ vp8_copy(cc->b_modes, b_modes);
+ vp8_copy(cc->inter_y_modes, inter_y_modes);
+ vp8_copy(cc->inter_uv_modes, inter_uv_modes);
+ vp8_copy(cc->inter_b_modes, inter_b_modes);
+#endif
+
+ cc->this_frame_percent_intra = cpi->this_frame_percent_intra;
+}
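+
+/* A simplified usage sketch of the save/restore pair above (the actual
+ * re-code loop in vp8_compress_frame applies additional size and quality
+ * checks; frame_size_is_off_target below is just a placeholder condition):
+ *
+ *     vp8_save_coding_context(cpi);
+ *     // ... encode the frame at the current quantizer ...
+ *     if (frame_size_is_off_target)
+ *     {
+ *         vp8_restore_coding_context(cpi);
+ *         // ... adjust the quantizer and encode the frame again ...
+ *     }
+ */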
+
+
+void vp8_restore_coding_context(VP8_COMP *cpi)
+{
+ CODING_CONTEXT *const cc = & cpi->coding_context;
+
+ // Restore key state variables to the snapshot state stored in the
+ // previous call to vp8_save_coding_context.
+
+ cpi->frames_since_key = cc->frames_since_key;
+ cpi->common.filter_level = cc->filter_level;
+ cpi->frames_till_gf_update_due = cc->frames_till_gf_update_due;
+ cpi->common.frames_since_golden = cc->frames_since_golden;
+
+ vp8_copy(cpi->common.fc.mvc, cc->mvc);
+
+ vp8_copy(cpi->mb.mvcosts, cc->mvcosts);
+
+ vp8_copy(cpi->common.kf_ymode_prob, cc->kf_ymode_prob);
+ vp8_copy(cpi->common.fc.ymode_prob, cc->ymode_prob);
+ vp8_copy(cpi->common.kf_uv_mode_prob, cc->kf_uv_mode_prob);
+ vp8_copy(cpi->common.fc.uv_mode_prob, cc->uv_mode_prob);
+
+ vp8_copy(cpi->ymode_count, cc->ymode_count);
+ vp8_copy(cpi->uv_mode_count, cc->uv_mode_count);
+
+ // Stats
+#ifdef MODE_STATS
+ vp8_copy(y_modes, cc->y_modes);
+ vp8_copy(uv_modes, cc->uv_modes);
+ vp8_copy(b_modes, cc->b_modes);
+ vp8_copy(inter_y_modes, cc->inter_y_modes);
+ vp8_copy(inter_uv_modes, cc->inter_uv_modes);
+ vp8_copy(inter_b_modes, cc->inter_b_modes);
+#endif
+
+
+ cpi->this_frame_percent_intra = cc->this_frame_percent_intra;
+}
+
+
+void vp8_setup_key_frame(VP8_COMP *cpi)
+{
+ // Setup for Key frame:
+
+ vp8_default_coef_probs(& cpi->common);
+ vp8_kf_default_bmode_probs(cpi->common.kf_bmode_prob);
+
+ vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ {
+ int flag[2] = {1, 1};
+ vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
+ }
+
+ vpx_memset(cpi->common.fc.pre_mvc, 0, sizeof(cpi->common.fc.pre_mvc)); //initialize pre_mvc to all zero.
+
+ //cpi->common.filter_level = 0; // Reset every key frame.
+ cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ;
+
+ // Provisional interval before next GF
+ if (cpi->auto_gold)
+ //cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ else
+ cpi->frames_till_gf_update_due = cpi->goldfreq;
+
+ cpi->common.refresh_golden_frame = TRUE;
+}
+
+void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi)
+{
+ // boost defaults to half second
+ int kf_boost;
+
+ // Clear down mmx registers to allow floating point in what follows
+ vp8_clear_system_state(); //__asm emms;
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ vp8_calc_iframe_target_size(cpi);
+ return;
+ }
+
+ if (cpi->pass == 2)
+ {
+ cpi->this_frame_target = cpi->per_frame_bandwidth; // New Two pass RC
+ }
+ else
+ {
+ // Boost depends somewhat on frame rate
+ kf_boost = (int)(2 * cpi->output_frame_rate - 16);
+
+ // Adjustment up based on Q
+ kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;
+
+ // Frame separation adjustment (down)
+ if (cpi->frames_since_key < cpi->output_frame_rate / 2)
+ kf_boost = (int)(kf_boost * cpi->frames_since_key / (cpi->output_frame_rate / 2));
+
+ if (kf_boost < 16)
+ kf_boost = 16;
+
+ // Reset the active worst quality to the baseline value for key frames.
+ cpi->active_worst_quality = cpi->worst_quality;
+
+ cpi->this_frame_target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4;
+ }
+
+
+ // Should the next frame be an altref frame
+ if (cpi->pass != 2)
+ {
+ // For now Alt ref is not allowed except in 2 pass modes.
+ cpi->source_alt_ref_pending = FALSE;
+
+ /*if ( cpi->oxcf.fixed_q == -1)
+ {
+ if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) )
+ cpi->source_alt_ref_pending = TRUE;
+ else
+ cpi->source_alt_ref_pending = FALSE;
+ }*/
+ }
+
+ if (0)
+ {
+ FILE *f;
+
+ f = fopen("kf_boost.stt", "a");
+ //fprintf(f, " %8d %10d %10d %10d %10d %10d %10d\n",
+ // cpi->common.current_video_frame, cpi->target_bandwidth, cpi->frames_to_key, kf_boost_qadjustment[cpi->ni_av_qi], cpi->kf_boost, (cpi->this_frame_target *100 / cpi->per_frame_bandwidth), cpi->this_frame_target );
+
+ fprintf(f, " %8u %10d %10d %10d\n",
+ cpi->common.current_video_frame, cpi->gfu_boost, cpi->baseline_gf_interval, cpi->source_alt_ref_pending);
+
+ fclose(f);
+ }
+}
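+
+/* Worked example for the one pass branch above (numbers are illustrative
+ * only): at 30 fps output, kf_boost starts at 2 * 30 - 16 = 44. With an
+ * ambient Q whose vp8_kf_boost_qadjustment entry is 150 this becomes
+ * 44 * 150 / 100 = 66, so the key frame target is
+ * ((16 + 66) * per_frame_bandwidth) >> 4, a little over five times the
+ * average per frame allocation.
+ */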
+
+// Do the best we can to define the parameters for the next GF based on what information we have available.
+static void calc_gf_params(VP8_COMP *cpi)
+{
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int Boost = 0;
+
+ int gf_frame_useage = 0; // Golden frame usage since last GF
+ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
+ cpi->recent_ref_frame_usage[LAST_FRAME] +
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+ cpi->recent_ref_frame_usage[ALTREF_FRAME];
+
+ int pct_gf_active = (100 * cpi->common.gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
+
+ // Reset the last boost indicator
+ //cpi->last_boost = 100;
+
+ if (tot_mbs)
+ gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
+
+ if (pct_gf_active > gf_frame_useage)
+ gf_frame_useage = pct_gf_active;
+
+ // Not two pass
+ if (cpi->pass != 2)
+ {
+ // Single Pass lagged mode: TBD
+ if (FALSE)
+ {
+ }
+
+ // Single Pass compression: Has to use current and historical data
+ else
+ {
+#if 0
+ // Experimental code
+ int index = cpi->one_pass_frame_index;
+ int frames_to_scan = (cpi->max_gf_interval <= MAX_LAG_BUFFERS) ? cpi->max_gf_interval : MAX_LAG_BUFFERS;
+
+ /*
+ // *************** Experimental code - incomplete
+ double decay_val = 1.0;
+ double IIAccumulator = 0.0;
+ double last_iiaccumulator = 0.0;
+ double IIRatio;
+
+ cpi->one_pass_frame_index = cpi->common.current_video_frame%MAX_LAG_BUFFERS;
+
+ for ( i = 0; i < (frames_to_scan - 1); i++ )
+ {
+ if ( index < 0 )
+ index = MAX_LAG_BUFFERS;
+ index --;
+
+ if ( cpi->one_pass_frame_stats[index].frame_coded_error > 0.0 )
+ {
+ IIRatio = cpi->one_pass_frame_stats[index].frame_intra_error / cpi->one_pass_frame_stats[index].frame_coded_error;
+
+ if ( IIRatio > 30.0 )
+ IIRatio = 30.0;
+ }
+ else
+ IIRatio = 30.0;
+
+ IIAccumulator += IIRatio * decay_val;
+
+ decay_val = decay_val * cpi->one_pass_frame_stats[index].frame_pcnt_inter;
+
+ if ( (i > MIN_GF_INTERVAL) &&
+ ((IIAccumulator - last_iiaccumulator) < 2.0) )
+ {
+ break;
+ }
+ last_iiaccumulator = IIAccumulator;
+ }
+
+ Boost = IIAccumulator*100.0/16.0;
+ cpi->baseline_gf_interval = i;
+
+ */
+#else
+
+ /*************************************************************/
+ // OLD code
+
+ // Adjust boost based upon ambient Q
+ Boost = GFQ_ADJUSTMENT;
+
+ // Adjust based upon most recently measured intra usage
+ Boost = Boost * vp8_gf_intra_useage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra : 14] / 100;
+
+ // Adjust gf boost based upon GF usage since last GF
+ Boost = Boost * vp8_gf_adjust_table[gf_frame_useage] / 100;
+#endif
+ }
+
+ // Golden frame boost without the recode loop often goes awry, so be safe by keeping numbers down.
+ if (!cpi->sf.recode_loop)
+ {
+ if (cpi->compressor_speed == 2)
+ Boost = Boost / 2;
+ }
+
+ // Apply an upper limit based on Q for 1 pass encodes
+ if (Boost > vp8_kf_gf_boost_qlimits[Q] && (cpi->pass == 0))
+ Boost = vp8_kf_gf_boost_qlimits[Q];
+
+ // Apply lower limits to boost.
+ else if (Boost < 110)
+ Boost = 110;
+
+ // Note the boost used
+ cpi->last_boost = Boost;
+
+ }
+
+ // Estimate next interval
+ // This is updated once the real frame size/boost is known.
+ if (cpi->oxcf.fixed_q == -1)
+ {
+ if (cpi->pass == 2) // 2 Pass
+ {
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ }
+ else // 1 Pass
+ {
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+ if (cpi->last_boost > 750)
+ cpi->frames_till_gf_update_due++;
+
+ if (cpi->last_boost > 1000)
+ cpi->frames_till_gf_update_due++;
+
+ if (cpi->last_boost > 1250)
+ cpi->frames_till_gf_update_due++;
+
+ if (cpi->last_boost >= 1500)
+ cpi->frames_till_gf_update_due ++;
+
+ if (vp8_gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due)
+ cpi->frames_till_gf_update_due = vp8_gf_interval_table[gf_frame_useage];
+
+ if (cpi->frames_till_gf_update_due > cpi->max_gf_interval)
+ cpi->frames_till_gf_update_due = cpi->max_gf_interval;
+ }
+ }
+ else
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+ // ARF on or off
+ if (cpi->pass != 2)
+ {
+ // For now Alt ref is not allowed except in 2 pass modes.
+ cpi->source_alt_ref_pending = FALSE;
+
+ /*if ( cpi->oxcf.fixed_q == -1)
+ {
+ if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + (AF_THRESH*cpi->frames_till_gf_update_due)) ) )
+ cpi->source_alt_ref_pending = TRUE;
+ else
+ cpi->source_alt_ref_pending = FALSE;
+ }*/
+ }
+}
+
+/* This is equivalent to estimate_bits_at_q without the rate_correction_factor. */
+static int baseline_bits_at_q(int frame_kind, int Q, int MBs)
+{
+ int Bpm = vp8_bits_per_mb[frame_kind][Q];
+
+ /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+ * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+ * largest Bpm takes 20 bits.
+ */
+ if (MBs > (1 << 11))
+ return (Bpm >> BPER_MB_NORMBITS) * MBs;
+ else
+ return (Bpm * MBs) >> BPER_MB_NORMBITS;
+}
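+
+/* Note on the cutoff above: the largest table entry (674781) needs 20 bits,
+ * so for MBs <= 2^11 the product Bpm * MBs still fits in 31 bits and the
+ * normalizing shift can be applied last for best precision. For larger
+ * frames, e.g. 1920x1088 with 8160 macroblocks, Bpm is pre-shifted instead,
+ * trading a little precision for overflow safety.
+ */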
+
+void vp8_calc_iframe_target_size(VP8_COMP *cpi)
+{
+ int Q;
+ int Boost = 100;
+
+ Q = (cpi->oxcf.fixed_q >= 0) ? cpi->oxcf.fixed_q : cpi->avg_frame_qindex;
+
+ if (cpi->auto_adjust_key_quantizer == 1)
+ {
+ // If (auto_adjust_key_quantizer==1) then a lower Q is selected for key frames.
+ // The enhanced Q is calculated so as to boost the key frame size by a factor
+ // specified in kf_boost_qadjustment, and can also be adjusted based on the
+ // distance between key frames.
+
+ // Adjust boost based upon ambient Q
+ Boost = vp8_kf_boost_qadjustment[Q];
+
+ // Make the key frame boost smaller if the separation from the previous key frame is small
+ if (cpi->frames_since_key < 16)
+ Boost = Boost * vp8_kf_boost_seperationt_adjustment[cpi->frames_since_key] / 100;
+ else
+ Boost = Boost * vp8_kf_boost_seperationt_adjustment[15] / 100;
+
+ // Apply limits on boost
+ if (Boost > vp8_kf_gf_boost_qlimits[Q])
+ Boost = vp8_kf_gf_boost_qlimits[Q];
+ else if (Boost < 120)
+ Boost = 120;
+ }
+
+ // Keep a record of the boost that was used
+ cpi->last_boost = Boost;
+
+ // Should the next frame be an altref frame
+ if (cpi->pass != 2)
+ {
+ // For now Alt ref is not allowed except in 2 pass modes.
+ cpi->source_alt_ref_pending = FALSE;
+
+ /*if ( cpi->oxcf.fixed_q == -1)
+ {
+ if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) )
+ cpi->source_alt_ref_pending = TRUE;
+ else
+ cpi->source_alt_ref_pending = FALSE;
+ }*/
+ }
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ cpi->this_frame_target = (baseline_bits_at_q(0, Q, cpi->common.MBs) * Boost) / 100;
+ }
+ else
+ {
+
+ int bits_per_mb_at_this_q ;
+
+ if (cpi->oxcf.error_resilient_mode == 1)
+ {
+ cpi->this_frame_target = 2 * cpi->av_per_frame_bandwidth;
+ return;
+ }
+
+ // Rate targeted scenario:
+ // Be careful of 32-bit OVERFLOW if restructuring the calculation of cpi->this_frame_target
+ bits_per_mb_at_this_q = (int)(.5 +
+ cpi->key_frame_rate_correction_factor * vp8_bits_per_mb[0][Q]);
+
+ cpi->this_frame_target = (((bits_per_mb_at_this_q * cpi->common.MBs) >> BPER_MB_NORMBITS) * Boost) / 100;
+
+ // Reset the active worst quality to the baseline value for key frames.
+ if (cpi->pass < 2)
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+}
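+
+/* Illustrative numbers for the rate targeted branch above: with a Q index of
+ * 60 (vp8_bits_per_mb[0][60] == 103434), a key frame rate correction factor
+ * of 1.0, 300 macroblocks and a boost of 200, the target works out to
+ * roughly ((103434 * 300) >> 9) * 200 / 100, i.e. about 121,000 bits.
+ */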
+
+
+
+void vp8_calc_pframe_target_size(VP8_COMP *cpi)
+{
+ int min_frame_target;
+ int Adjustment;
+
+ // Set the min frame bandwidth.
+ //min_frame_target = estimate_min_frame_size( cpi );
+ min_frame_target = 0;
+
+ if (cpi->pass == 2)
+ {
+ min_frame_target = cpi->min_frame_bandwidth;
+
+ if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
+ min_frame_target = cpi->av_per_frame_bandwidth >> 5;
+ }
+ else if (min_frame_target < cpi->per_frame_bandwidth / 4)
+ min_frame_target = cpi->per_frame_bandwidth / 4;
+
+
+ // Special alt reference frame case
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ if (cpi->pass == 2)
+ {
+ cpi->per_frame_bandwidth = cpi->gf_bits; // Per frame bit target for the alt ref frame
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ }
+
+ /* One Pass ??? TBD */
+ /*else
+ {
+ int frames_in_section;
+ int allocation_chunks;
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int alt_boost;
+ int max_arf_rate;
+
+ alt_boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+ alt_boost += (cpi->frames_till_gf_update_due * 50);
+
+ // If alt ref is not currently active then we have a potential double hit with GF and ARF so reduce the boost a bit.
+ // A similar thing is done on GFs that precede an arf update.
+ if ( !cpi->source_alt_ref_active )
+ alt_boost = alt_boost * 3 / 4;
+
+ frames_in_section = cpi->frames_till_gf_update_due+1; // Standard frames + GF
+ allocation_chunks = (frames_in_section * 100) + alt_boost;
+
+ // Normalize alt_boost and the allocation chunks down to prevent overflow
+ while ( alt_boost > 1000 )
+ {
+ alt_boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ else
+ {
+ int bits_in_section;
+
+ if ( cpi->kf_overspend_bits > 0 )
+ {
+ Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits;
+
+ if ( Adjustment > (cpi->per_frame_bandwidth - min_frame_target) )
+ Adjustment = (cpi->per_frame_bandwidth - min_frame_target);
+
+ cpi->kf_overspend_bits -= Adjustment;
+
+ // Calculate an inter frame bandwidth target for the next few frames designed to recover
+ // any extra bits spent on the key frame.
+ cpi->inter_frame_target = cpi->per_frame_bandwidth - Adjustment;
+ if ( cpi->inter_frame_target < min_frame_target )
+ cpi->inter_frame_target = min_frame_target;
+ }
+ else
+ cpi->inter_frame_target = cpi->per_frame_bandwidth;
+
+ bits_in_section = cpi->inter_frame_target * frames_in_section;
+
+ // Avoid loss of precision but avoid overflow
+ if ( (bits_in_section>>7) > allocation_chunks )
+ cpi->this_frame_target = alt_boost * (bits_in_section / allocation_chunks);
+ else
+ cpi->this_frame_target = (alt_boost * bits_in_section) / allocation_chunks;
+ }
+ }
+ */
+ }
+
+ // Normal frames (gf and inter)
+ else
+ {
+ // 2 pass
+ if (cpi->pass == 2)
+ {
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ }
+ // 1 pass
+ else
+ {
+ // Make rate adjustment to recover bits spent in key frame
+ // Test to see if the key frame inter data rate correction should still be in force
+ if (cpi->kf_overspend_bits > 0)
+ {
+ Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits;
+
+ if (Adjustment > (cpi->per_frame_bandwidth - min_frame_target))
+ Adjustment = (cpi->per_frame_bandwidth - min_frame_target);
+
+ cpi->kf_overspend_bits -= Adjustment;
+
+ // Calculate an inter frame bandwidth target for the next few frames designed to recover
+ // any extra bits spent on the key frame.
+ cpi->this_frame_target = cpi->per_frame_bandwidth - Adjustment;
+
+ if (cpi->this_frame_target < min_frame_target)
+ cpi->this_frame_target = min_frame_target;
+ }
+ else
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+
+ // If appropriate make an adjustment to recover bits spent on a recent GF
+ if ((cpi->gf_overspend_bits > 0) && (cpi->this_frame_target > min_frame_target))
+ {
+ int Adjustment = (cpi->non_gf_bitrate_adjustment <= cpi->gf_overspend_bits) ? cpi->non_gf_bitrate_adjustment : cpi->gf_overspend_bits;
+
+ if (Adjustment > (cpi->this_frame_target - min_frame_target))
+ Adjustment = (cpi->this_frame_target - min_frame_target);
+
+ cpi->gf_overspend_bits -= Adjustment;
+ cpi->this_frame_target -= Adjustment;
+ }
+
+ // Apply small + and - boosts for non gf frames
+ if ((cpi->last_boost > 150) && (cpi->frames_till_gf_update_due > 0) &&
+ (cpi->current_gf_interval >= (MIN_GF_INTERVAL << 1)))
+ {
+ // % Adjustment limited to the range 1% to 10%
+ Adjustment = (cpi->last_boost - 100) >> 5;
+
+ if (Adjustment < 1)
+ Adjustment = 1;
+ else if (Adjustment > 10)
+ Adjustment = 10;
+
+ // Convert to bits
+ Adjustment = (cpi->this_frame_target * Adjustment) / 100;
+
+ if (Adjustment > (cpi->this_frame_target - min_frame_target))
+ Adjustment = (cpi->this_frame_target - min_frame_target);
+
+ if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1))
+ cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
+ else
+ cpi->this_frame_target -= Adjustment;
+ }
+ }
+ }
+
+ // Set a reduced data rate target for our initial Q calculation.
+ // This should help to save bits during easier sections.
+ if ((cpi->oxcf.under_shoot_pct > 0) && (cpi->oxcf.under_shoot_pct <= 100))
+ cpi->this_frame_target = (cpi->this_frame_target * cpi->oxcf.under_shoot_pct) / 100;
+
+ // Sanity check that the total sum of adjustments is not above the maximum allowed.
+ // That is, having allowed for the KF and GF penalties, we have not pushed the
+ // current inter frame target too low. If the adjustment we apply here is not capable of recovering
+ // all the extra bits we have spent in the KF or GF then the remainder will have to be recovered over
+ // a longer time span via other buffer / rate control mechanisms.
+ if (cpi->this_frame_target < min_frame_target)
+ cpi->this_frame_target = min_frame_target;
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ // Note the baseline target data rate for this inter frame.
+ cpi->inter_frame_target = cpi->this_frame_target;
+
+ // One Pass specific code
+ if (cpi->pass == 0)
+ {
+ // Adapt target frame size with respect to any buffering constraints:
+ if (cpi->buffered_mode)
+ {
+ int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100;
+
+ if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
+ {
+ int percent_low = 0;
+
+ // Decide whether or not we need to adjust the frame data rate target.
+ //
+ // If we are below the optimal buffer fullness level and adherence
+ // to buffering constraints is important to the end usage then adjust
+ // the per frame target.
+ if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
+ {
+ percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits;
+
+ if (percent_low > 100)
+ percent_low = 100;
+ else if (percent_low < 0)
+ percent_low = 0;
+ }
+ // Are we overshooting the long term clip data rate...
+ else if (cpi->bits_off_target < 0)
+ {
+ // Adjust per frame data target downwards to compensate.
+ percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8));
+
+ if (percent_low > 100)
+ percent_low = 100;
+ else if (percent_low < 0)
+ percent_low = 0;
+ }
+
+ // lower the target bandwidth for this frame.
+ cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
+
+ // Are we allowing control of active_worst_allowed_q according to buffer level?
+ if (cpi->auto_worst_q)
+ {
+ int critical_buffer_level;
+
+ // For streaming applications the most important factor is cpi->buffer_level as this takes
+ // into account the specified short term buffering constraints. However, hitting the long
+ // term clip data rate target is also important.
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ // Take the smaller of cpi->buffer_level and cpi->bits_off_target
+ critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target;
+ }
+ // For local file playback short term buffering constraints are less of an issue
+ else
+ {
+ // Consider only how we are doing for the clip as a whole
+ critical_buffer_level = cpi->bits_off_target;
+ }
+
+ // Set the active worst quality based upon the selected buffer fullness number.
+ if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)
+ {
+ if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4))
+ {
+ int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi;
+ int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4));
+
+ // Scale the active worst quality from cpi->ni_av_qi, when
+ // (critical_buffer_level == cpi->oxcf.optimal_buffer_level), up to
+ // cpi->worst_quality, when (critical_buffer_level == cpi->oxcf.optimal_buffer_level/4)
+ cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4));
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->ni_av_qi;
+ }
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+ }
+ else
+ {
+ int percent_high;
+
+ if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level)
+ {
+ percent_high = (int)(100 * (cpi->bits_off_target - cpi->oxcf.optimal_buffer_level) / (cpi->total_byte_count * 8));
+
+ if (percent_high > 100)
+ percent_high = 100;
+ else if (percent_high < 0)
+ percent_high = 0;
+
+ cpi->this_frame_target = (cpi->this_frame_target * (100 + (percent_high / 2))) / 100;
+
+ }
+
+ // Are we allowing control of active_worst_allowed_q according to buffer level?
+ if (cpi->auto_worst_q)
+ {
+ // When using the relaxed buffer model stick to the user specified value
+ cpi->active_worst_quality = cpi->ni_av_qi;
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+ }
+
+ // Set active_best_quality to prevent quality rising too high
+ cpi->active_best_quality = cpi->best_quality;
+
+ // Worst quality obviously must not be better than best quality
+ if (cpi->active_worst_quality <= cpi->active_best_quality)
+ cpi->active_worst_quality = cpi->active_best_quality + 1;
+
+ }
+ // Unbuffered mode (e.g. video conferencing)
+ else
+ {
+ // Set the active worst quality
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+ }
+
+ // Test to see if we have to drop a frame.
+ // The auto-drop frame code is only used in buffered mode.
+ // In unbuffered mode (e.g. video conferencing) the decision to
+ // code or drop a frame is made outside the codec in response to real
+ // world comms or buffer considerations.
+ if (cpi->drop_frames_allowed && cpi->buffered_mode &&
+ (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ ((cpi->common.frame_type != KEY_FRAME))) //|| !cpi->oxcf.allow_spatial_resampling) )
+ {
+ // Check for a buffer underrun crisis, in which case we have to drop a frame
+ if ((cpi->buffer_level < 0))
+ {
+#if 0
+ FILE *f = fopen("dec.stt", "a");
+ fprintf(f, "%10d %10d %10d %10d ***** BUFFER EMPTY\n",
+ (int) cpi->common.current_video_frame,
+ cpi->decimation_factor, cpi->common.horiz_scale,
+ (cpi->buffer_level * 100) / cpi->oxcf.optimal_buffer_level);
+ fclose(f);
+#endif
+ //vpx_log("Decoder: Drop frame due to bandwidth: %d \n",cpi->buffer_level, cpi->av_per_frame_bandwidth);
+
+ cpi->drop_frame = TRUE;
+ }
+
+#if 0
+ // Check for other drop frame criteria (Note: 2 pass CBR uses decimation on whole KF sections)
+ else if ((cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) &&
+ (cpi->drop_count < cpi->max_drop_count) && (cpi->pass == 0))
+ {
+ cpi->drop_frame = TRUE;
+ }
+
+#endif
+
+ if (cpi->drop_frame)
+ {
+ // Update the buffer level variable.
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+ cpi->buffer_level = cpi->bits_off_target;
+ }
+ else
+ cpi->drop_count = 0;
+ }
+
+ // Adjust target frame size for Golden Frames:
+ if (cpi->oxcf.error_resilient_mode == 0 &&
+ (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame)
+ {
+ //int Boost = 0;
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+
+ int gf_frame_useage = 0; // Golden frame usage since last GF
+ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
+ cpi->recent_ref_frame_usage[LAST_FRAME] +
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+ cpi->recent_ref_frame_usage[ALTREF_FRAME];
+
+ int pct_gf_active = (100 * cpi->common.gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
+
+ // Reset the last boost indicator
+ //cpi->last_boost = 100;
+
+ if (tot_mbs)
+ gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
+
+ if (pct_gf_active > gf_frame_useage)
+ gf_frame_useage = pct_gf_active;
+
+ // Is a fixed manual GF frequency being used
+ if (!cpi->auto_gold)
+ cpi->common.refresh_golden_frame = TRUE;
+ else
+ {
+ // For one pass throw a GF if recent frame intra usage is low or the GF usage is high
+ if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))
+ cpi->common.refresh_golden_frame = TRUE;
+
+ // Two pass GF decision
+ else if (cpi->pass == 2)
+ cpi->common.refresh_golden_frame = TRUE;
+ }
+
+#if 0
+
+ // Debug stats
+ if (0)
+ {
+ FILE *f;
+
+ f = fopen("gf_useaget.stt", "a");
+ fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n",
+ cpi->common.current_video_frame, cpi->gfu_boost, GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage);
+ fclose(f);
+ }
+
+#endif
+
+ if (cpi->common.refresh_golden_frame == TRUE)
+ {
+ int isize_adjustment = 0;
+#if 0
+
+ if (0) // p_gw
+ {
+ FILE *f;
+
+ f = fopen("GFexit.stt", "a");
+ fprintf(f, "%8ld GF coded\n", cpi->common.current_video_frame);
+ fclose(f);
+ }
+
+#endif
+ cpi->initial_gf_use = 0;
+
+ if (cpi->auto_adjust_gold_quantizer)
+ {
+ calc_gf_params(cpi);
+ }
+
+ // If we are using an alternate ref instead of a gf then do not apply the boost here.
+ // It will instead be applied to the altref update.
+ // Jim's modified boost
+ if (!cpi->source_alt_ref_active)
+ {
+ if (cpi->oxcf.fixed_q < 0)
+ {
+ if (cpi->pass == 2)
+ {
+ cpi->this_frame_target = cpi->per_frame_bandwidth; // The spend on the GF is defined in the two pass code for two pass encodes
+ }
+ else
+ {
+ int Boost = cpi->last_boost;
+ int frames_in_section = cpi->frames_till_gf_update_due + 1;
+ int allocation_chunks = (frames_in_section * 100) + (Boost - 100);
+ int bits_in_section = cpi->inter_frame_target * frames_in_section;
+
+ // Normalize Boost and the allocation chunks down to prevent overflow
+ while (Boost > 1000)
+ {
+ Boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ // Avoid loss of precision but avoid overflow
+ if ((bits_in_section >> 7) > allocation_chunks)
+ cpi->this_frame_target = Boost * (bits_in_section / allocation_chunks);
+ else
+ cpi->this_frame_target = (Boost * bits_in_section) / allocation_chunks;
+ }
+ }
+ else
+ cpi->this_frame_target = (baseline_bits_at_q(1, Q, cpi->common.MBs) * cpi->last_boost) / 100;
+
+ }
+ // If there is an active ARF at this location use the minimum bits on this frame
+ else
+ {
+ cpi->this_frame_target = 0; // Minimal spend on a gf that is replacing an arf
+ }
+
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+ }
+ }
+}
+
+
+void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
+{
+ int Q = cpi->common.base_qindex;
+ int correction_factor = 100;
+ double rate_correction_factor;
+ double adjustment_limit;
+
+ int projected_size_based_on_q = 0;
+
+ // Clear down mmx registers to allow floating point in what follows
+ vp8_clear_system_state(); //__asm emms;
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ rate_correction_factor = cpi->key_frame_rate_correction_factor;
+ }
+ else
+ {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ rate_correction_factor = cpi->gf_rate_correction_factor;
+ else
+ rate_correction_factor = cpi->rate_correction_factor;
+ }
+
+ // Work out how big we would have expected the frame to be at this Q given the current correction factor.
+ // Stay in double to avoid int overflow when values are large
+ //projected_size_based_on_q = ((int)(.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) >> BPER_MB_NORMBITS;
+ projected_size_based_on_q = (int)(((.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
+
+ // Make some allowance for cpi->zbin_over_quant
+ if (cpi->zbin_over_quant > 0)
+ {
+ int Z = cpi->zbin_over_quant;
+ double Factor = 0.99;
+ double factor_adjustment = 0.01 / 256.0; //(double)ZBIN_OQ_MAX;
+
+ while (Z > 0)
+ {
+ Z --;
+ projected_size_based_on_q = (int)(projected_size_based_on_q * Factor);
+ Factor += factor_adjustment;
+
+ if (Factor >= 0.999)
+ Factor = 0.999;
+ }
+ }
+
+ // Work out a size correction factor.
+ //if ( cpi->this_frame_target > 0 )
+ // correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
+ if (projected_size_based_on_q > 0)
+ correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+
+ // More heavily damped adjustment used if we have been oscillating either side of target
+ switch (damp_var)
+ {
+ case 0:
+ adjustment_limit = 0.75;
+ break;
+ case 1:
+ adjustment_limit = 0.375;
+ break;
+ case 2:
+ default:
+ adjustment_limit = 0.25;
+ break;
+ }
+
+ //if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
+ if (correction_factor > 102)
+ {
+ // We are not already at the worst allowable quality
+ correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ }
+ //else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
+ else if (correction_factor < 99)
+ {
+ // We are not already at the best allowable quality
+ correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ cpi->key_frame_rate_correction_factor = rate_correction_factor;
+ else
+ {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ cpi->gf_rate_correction_factor = rate_correction_factor;
+ else
+ cpi->rate_correction_factor = rate_correction_factor;
+ }
+}
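+
+/* Example of the damping above (illustrative figures): if the projected size
+ * at this Q is 100,000 bits but the frame actually produced 120,000 bits, the
+ * raw correction factor is 120. With damp_var == 0 (adjustment_limit 0.75)
+ * this is damped to (int)(100.5 + 20 * 0.75) = 115, so the stored rate
+ * correction factor grows by about 15% rather than the full 20%.
+ */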
+
+static int estimate_bits_at_q(VP8_COMP *cpi, int Q)
+{
+ int Bpm = (int)(.5 + cpi->rate_correction_factor * vp8_bits_per_mb[INTER_FRAME][Q]);
+
+ /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+ * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+ * largest Bpm takes 20 bits.
+ */
+ if (cpi->common.MBs > (1 << 11))
+ return (Bpm >> BPER_MB_NORMBITS) * cpi->common.MBs;
+ else
+ return (Bpm * cpi->common.MBs) >> BPER_MB_NORMBITS;
+
+}
+
+
+int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
+{
+ int Q = cpi->active_worst_quality;
+
+ // Reset Zbin OQ value
+ cpi->zbin_over_quant = 0;
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ Q = cpi->oxcf.fixed_q;
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ Q = cpi->oxcf.key_q;
+ }
+ else if (cpi->common.refresh_alt_ref_frame)
+ {
+ Q = cpi->oxcf.alt_q;
+ }
+ else if (cpi->common.refresh_golden_frame)
+ {
+ Q = cpi->oxcf.gold_q;
+ }
+
+ }
+ else
+ {
+ int i;
+ int last_error = INT_MAX;
+ int target_bits_per_mb;
+ int bits_per_mb_at_this_q;
+ double correction_factor;
+
+ // Select the appropriate correction factor based upon type of frame.
+ if (cpi->common.frame_type == KEY_FRAME)
+ correction_factor = cpi->key_frame_rate_correction_factor;
+ else
+ {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ correction_factor = cpi->gf_rate_correction_factor;
+ else
+ correction_factor = cpi->rate_correction_factor;
+ }
+
+ // Calculate required scaling factor based on target frame size and size of frame produced using previous Q
+ if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
+ target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS; // Case where we would overflow int
+ else
+ target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+
+ i = cpi->active_best_quality;
+
+ do
+ {
+ bits_per_mb_at_this_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][i]);
+
+ if (bits_per_mb_at_this_q <= target_bits_per_mb)
+ {
+ if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+ Q = i;
+ else
+ Q = i - 1;
+
+ break;
+ }
+ else
+ last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+ }
+ while (++i <= cpi->active_worst_quality);
+
+
+ // If we are at MAXQ then enable Q over-run which seeks to claw back additional bits through things like
+ // the RD multiplier and zero bin size.
+ if (Q >= MAXQ)
+ {
+ int zbin_oqmax;
+
+ double Factor = 0.99;
+ double factor_adjustment = 0.01 / 256.0; //(double)ZBIN_OQ_MAX;
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ zbin_oqmax = 0; //ZBIN_OQ_MAX/16
+ else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
+ zbin_oqmax = 16;
+ else
+ zbin_oqmax = ZBIN_OQ_MAX;
+
+ /*{
+ double Factor = (double)target_bits_per_mb/(double)bits_per_mb_at_this_q;
+ double Oq;
+
+ Factor = Factor/1.2683;
+
+ Oq = pow( Factor, (1.0/-0.165) );
+
+ if ( Oq > zbin_oqmax )
+ Oq = zbin_oqmax;
+
+ cpi->zbin_over_quant = (int)Oq;
+ }*/
+
+ // Each increment in the zbin is assumed to have a fixed effect on bitrate. This is, of course, not strictly true.
+ // The effect will be highly clip dependent and may well have sudden steps.
+ // The idea here is to achieve higher effective quantizers than the normal maximum by expanding the zero
+ // bin and hence decreasing the number of low magnitude non zero coefficients.
+ while (cpi->zbin_over_quant < zbin_oqmax)
+ {
+ cpi->zbin_over_quant ++;
+
+ if (cpi->zbin_over_quant > zbin_oqmax)
+ cpi->zbin_over_quant = zbin_oqmax;
+
+ bits_per_mb_at_this_q = (int)(bits_per_mb_at_this_q * Factor); // Each over-run step is assumed to equate to approximately 3% reduction in bitrate
+ Factor += factor_adjustment;
+
+ if (Factor >= 0.999)
+ Factor = 0.999;
+
+ if (bits_per_mb_at_this_q <= target_bits_per_mb) // Break out if we get down to the target rate
+ break;
+ }
+
+ }
+ }
+
+ return Q;
+}
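+
+/* Note on the search above: Q is chosen by a simple linear walk from
+ * cpi->active_best_quality towards cpi->active_worst_quality, stopping at the
+ * first index whose corrected bits-per-MB estimate falls at or below the
+ * target (or stepping back one index if the previous estimate was closer).
+ * Only when the walk ends at MAXQ is the zero bin extension (zbin_over_quant)
+ * used to push the effective quantizer beyond the normal range.
+ */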
+
+static int estimate_min_frame_size(VP8_COMP *cpi)
+{
+ double correction_factor;
+ int bits_per_mb_at_max_q;
+
+ // This function returns a default value for the first few frames until the correction factor has had time to adapt.
+ if (cpi->common.current_video_frame < 10)
+ {
+ if (cpi->pass == 2)
+ return (cpi->min_frame_bandwidth);
+ else
+ return cpi->per_frame_bandwidth / 3;
+ }
+
+ /* // Select the appropriate correction factor based upon type of frame.
+ if ( cpi->common.frame_type == KEY_FRAME )
+ correction_factor = cpi->key_frame_rate_correction_factor;
+ else
+ {
+ if ( cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame )
+ correction_factor = cpi->gf_rate_correction_factor;
+ else
+ correction_factor = cpi->rate_correction_factor;
+ }*/
+
+ // We estimate at half the value we get from vp8_bits_per_mb
+ correction_factor = cpi->rate_correction_factor / 2.0;
+
+ bits_per_mb_at_max_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][MAXQ]);
+
+ return (bits_per_mb_at_max_q * cpi->common.MBs) >> BPER_MB_NORMBITS;
+}
+
+void vp8_adjust_key_frame_context(VP8_COMP *cpi)
+{
+ int i;
+ int av_key_frames_per_second;
+
+ // Average key frame frequency and size
+ unsigned int total_weight = 0;
+ unsigned int av_key_frame_frequency = 0;
+ unsigned int av_key_frame_bits = 0;
+
+ unsigned int output_frame_rate = (unsigned int)(100 * cpi->output_frame_rate);
+ unsigned int target_bandwidth = (unsigned int)(100 * cpi->target_bandwidth);
+
+ // Clear down mmx registers to allow floating point in what follows
+ vp8_clear_system_state(); //__asm emms;
+
+ // Update the count of total key frame bits
+ cpi->tot_key_frame_bits += cpi->projected_frame_size;
+
+ // First key frame at start of sequence is a special case. We have no frequency data.
+ if (cpi->key_frame_count == 1)
+ {
+ av_key_frame_frequency = (int)cpi->output_frame_rate * 2; // Assume a default of 1 kf every 2 seconds
+ av_key_frame_bits = cpi->projected_frame_size;
+ av_key_frames_per_second = output_frame_rate / av_key_frame_frequency; // Note output_frame_rate not cpi->output_frame_rate
+ }
+ else
+ {
+ // reset keyframe context and calculate weighted average of last KEY_FRAME_CONTEXT keyframes
+ for (i = 0; i < KEY_FRAME_CONTEXT; i++)
+ {
+ if (i < KEY_FRAME_CONTEXT - 1)
+ {
+ cpi->prior_key_frame_size[i] = cpi->prior_key_frame_size[i+1];
+ cpi->prior_key_frame_distance[i] = cpi->prior_key_frame_distance[i+1];
+ }
+ else
+ {
+ cpi->prior_key_frame_size[KEY_FRAME_CONTEXT - 1] = cpi->projected_frame_size;
+ cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1] = cpi->frames_since_key;
+ }
+
+ av_key_frame_bits += prior_key_frame_weight[i] * cpi->prior_key_frame_size[i];
+ av_key_frame_frequency += prior_key_frame_weight[i] * cpi->prior_key_frame_distance[i];
+ total_weight += prior_key_frame_weight[i];
+ }
+
+ av_key_frame_bits /= total_weight;
+ av_key_frame_frequency /= total_weight;
+ av_key_frames_per_second = output_frame_rate / av_key_frame_frequency;
+
+ }
+
+ // Do we have any key frame overspend to recover?
+ if ((cpi->pass != 2) && (cpi->projected_frame_size > cpi->per_frame_bandwidth))
+ {
+ // Update the count of key frame overspend to be recovered in subsequent frames
+ // A portion of the KF overspend is treated as gf overspend (and hence recovered more quickly)
+ // as the kf is also a gf. Otherwise the few frames following each kf tend to get more bits
+ // allocated than those following other gfs.
+ cpi->kf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 7 / 8;
+ cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 1 / 8;
+
+ // Work out how much to try and recover per frame.
+ // For one pass we estimate the number of frames to spread it over based upon past history.
+ // For two pass we know how many frames there will be till the next kf.
+ if (cpi->pass == 2)
+ {
+ if (cpi->frames_to_key > 16)
+ cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)cpi->frames_to_key;
+ else
+ cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / 16;
+ }
+ else
+ cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)av_key_frame_frequency;
+ }
+
+ cpi->frames_since_key = 0;
+ cpi->last_key_frame_size = cpi->projected_frame_size;
+ cpi->key_frame_count++;
+}
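+
+/* Example of the weighted history above: prior_key_frame_weight is
+ * { 1, 2, 3, 4, 5 }, so once the context is full the most recent key frame
+ * contributes 5/15 of the average size and spacing while the oldest
+ * contributes only 1/15. In one pass mode any key frame overspend is then
+ * recovered over roughly one average key frame interval via
+ * kf_bitrate_adjustment.
+ */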
+
+void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit)
+{
+ // Set-up bounds on acceptable frame size:
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ // Fixed Q scenario: frame size never outranges target (there is no target!)
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ }
+ else
+ {
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+ }
+ else
+ {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+ }
+ else
+ {
+ // For CBR take buffer fullness into account
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ if (cpi->buffer_level >= ((cpi->oxcf.optimal_buffer_level + cpi->oxcf.maximum_buffer_size) >> 1))
+ {
+ // Buffer is too full so relax overshoot and tighten undershoot
+ *frame_over_shoot_limit = cpi->this_frame_target * 12 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 6 / 8;
+ }
+ else if (cpi->buffer_level <= (cpi->oxcf.optimal_buffer_level >> 1))
+ {
+ // Buffer is too low so relax undershoot and tighten overshoot
+ *frame_over_shoot_limit = cpi->this_frame_target * 10 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 4 / 8;
+ }
+ else
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+ }
+ }
+ // VBR
+ // Note that tighter restrictions here can help quality but hurt encode speed
+ else
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+ }
+ }
+ }
+ }
+}
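+
+/* Summary of the bounds chosen above, as fractions of this_frame_target:
+ *   key frames and GF/ARF frames:           7/8 .. 9/8
+ *   CBR with the buffer well above optimal: 6/8 .. 12/8
+ *   CBR with the buffer below half optimal: 4/8 .. 10/8
+ *   CBR otherwise, and VBR:                 5/8 .. 11/8
+ * Fixed Q encodes are effectively unconstrained (0 .. INT_MAX).
+ */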
diff --git a/vp8/encoder/ratectrl.h b/vp8/encoder/ratectrl.h
new file mode 100644
index 000000000..588c7a823
--- /dev/null
+++ b/vp8/encoder/ratectrl.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#if !defined __INC_RATECTRL_H
+#define __INC_RATECTRL_H
+
+#include "onyx_int.h"
+
+extern void vp8_save_coding_context(VP8_COMP *cpi);
+extern void vp8_restore_coding_context(VP8_COMP *cpi);
+
+extern void vp8_setup_key_frame(VP8_COMP *cpi);
+extern void vp8_calc_iframe_target_size(VP8_COMP *cpi);
+extern void vp8_calc_pframe_target_size(VP8_COMP *cpi);
+extern void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var);
+extern int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame);
+extern void vp8_adjust_key_frame_context(VP8_COMP *cpi);
+extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit);
+
+#endif
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
new file mode 100644
index 000000000..084699628
--- /dev/null
+++ b/vp8/encoder/rdopt.c
@@ -0,0 +1,2212 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+#include "pragmas.h"
+
+#include "tokenize.h"
+#include "treewriter.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "entropymode.h"
+#include "reconinter.h"
+#include "reconintra.h"
+#include "reconintra4x4.h"
+#include "findnearmv.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "idct.h"
+#include "g_common.h"
+#include "variance.h"
+#include "mcomp.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "dct.h"
+#include "systemdependent.h"
+
+#define DIAMONDSEARCH 1
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+
+void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
+
+
+#define RDFUNC(RM,DM,R,D,target_rd) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
+/*int RDFUNC( int RM,int DM, int R, int D, int target_r )
+{
+ int rd_value;
+
+ rd_value = ( ((128+(R)*(RM)) >> 8) + (DM)*(D) );
+
+ return rd_value;
+}*/
+
+#define UVRDFUNC(RM,DM,R,D,target_r) RDFUNC(RM,DM,R,D,target_r)
+
+#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
+
+#define MAXF(a,b) (((a) > (b)) ? (a) : (b))
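+
+/* Example of the RD cost form used throughout this file: RDCOST(RM, DM, R, D)
+ * evaluates ((128 + R * RM) >> 8) + DM * D, i.e. the rate R (in bits) is
+ * weighted by RM/256 with rounding before being added to the weighted
+ * distortion. For instance RDCOST(300, 1, 40, 50) = ((128 + 12000) >> 8) + 50
+ * = 47 + 50 = 97.
+ */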
+
+
+extern const TOKENEXTRA vp8_dct_value_tokens[DCT_MAX_VALUE*2];
+extern const TOKENEXTRA *vp8_dct_value_tokens_ptr;
+extern int vp8_dct_value_cost[DCT_MAX_VALUE*2];
+extern int *vp8_dct_value_cost_ptr;
+
+
+const int vp8_auto_speed_thresh[17] =
+{
+ 1000,
+ 200,
+ 150,
+ 130,
+ 150,
+ 125,
+ 120,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 105
+};
+
+const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] =
+{
+ ZEROMV,
+ DC_PRED,
+
+ NEARESTMV,
+ NEARMV,
+
+ ZEROMV,
+ NEARESTMV,
+
+ ZEROMV,
+ NEARESTMV,
+
+ NEARMV,
+ NEARMV,
+
+ V_PRED,
+ H_PRED,
+ TM_PRED,
+
+ NEWMV,
+ NEWMV,
+ NEWMV,
+
+ SPLITMV,
+ SPLITMV,
+ SPLITMV,
+
+ B_PRED,
+};
+
+const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES] =
+{
+ LAST_FRAME,
+ INTRA_FRAME,
+
+ LAST_FRAME,
+ LAST_FRAME,
+
+ GOLDEN_FRAME,
+ GOLDEN_FRAME,
+
+ ALTREF_FRAME,
+ ALTREF_FRAME,
+
+ GOLDEN_FRAME,
+ ALTREF_FRAME,
+
+ INTRA_FRAME,
+ INTRA_FRAME,
+ INTRA_FRAME,
+
+ LAST_FRAME,
+ GOLDEN_FRAME,
+ ALTREF_FRAME,
+
+ LAST_FRAME,
+ GOLDEN_FRAME,
+ ALTREF_FRAME,
+
+ INTRA_FRAME,
+};
+
+static void fill_token_costs(
+ unsigned int c [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens],
+ const vp8_prob p [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1]
+)
+{
+ int i, j, k;
+
+
+ for (i = 0; i < BLOCK_TYPES; i++)
+ for (j = 0; j < COEF_BANDS; j++)
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+
+ vp8_cost_tokens((int *)(c [i][j][k]), p [i][j][k], vp8_coef_tree);
+
+}
+
+static int rd_iifactor [ 32 ] = { 16, 16, 16, 12, 8, 4, 2, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+
+
+
+
+// The values in this table should be reviewed
+static int sad_per_bit16lut[128] =
+{
+ 4, 4, 4, 4, 4, 4, 4, 4, // 4
+ 4, 4, 4, 4, 4, 4, 4, 4, // 1
+ 4, 4, 4, 4, 4, 4, 4, 4, // 2
+ 4, 4, 4, 4, 4, 4, 4, 4, // 3
+ 4, 4, 4, 4, 4, 4, 4, 4, // 4
+ 4, 4, 12, 12, 13, 13, 14, 14, // 5
+ 14, 14, 14, 15, 15, 15, 15, 15, // 6
+ 15, 15, 15, 15, 15, 15, 15, 15, // 7
+ 15, 15, 15, 15, 15, 16, 16, 16, // 8
+ 16, 16, 18, 18, 18, 18, 19, 19, // 9
+ 19, 19, 19, 19, 19, 19, 19, 19, // 10
+ 20, 20, 22, 22, 22, 22, 21, 21, // 11
+ 22, 22, 22, 22, 22, 22, 22, 22, // 12
+ 22, 22, 22, 22, 22, 22, 22, 22, // 13
+ 22, 22, 22, 22, 22, 22, 22, 22, // 14
+ 22, 22, 22, 22, 22, 22, 22, 22, // 15
+};
+
+static int sad_per_bit4lut[128] =
+{
+ 4, 4, 4, 4, 4, 4, 4, 4, // 4
+ 4, 4, 4, 4, 4, 4, 4, 4, // 1
+ 4, 4, 4, 4, 4, 4, 4, 4, // 2
+ 4, 4, 4, 4, 4, 4, 4, 4, // 3
+ 4, 4, 4, 4, 4, 4, 4, 4, // 4
+ 4, 4, 15, 15, 15, 15, 16, 16, // 5
+ 16, 17, 17, 17, 17, 17, 17, 17, // 6
+ 17, 17, 19, 19, 22, 22, 21, 21, // 7
+ 23, 23, 23, 23, 23, 24, 24, 24, // 8
+ 25, 25, 27, 27, 27, 27, 28, 28, // 9
+ 28, 28, 29, 29, 29, 29, 29, 29, // 10
+ 30, 30, 31, 31, 31, 31, 32, 32, // 11
+ 34, 34, 34, 34, 34, 34, 34, 34, // 12
+ 34, 34, 34, 34, 34, 34, 34, 34, // 13
+ 34, 34, 34, 34, 34, 34, 34, 34, // 14
+ 34, 34, 34, 34, 34, 34, 34, 34, // 15
+};
+
+void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex)
+{
+ cpi->mb.sadperbit16 = sad_per_bit16lut[QIndex];
+ cpi->mb.sadperbit4 = sad_per_bit4lut[QIndex];
+}
+
+void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
+{
+ int q;
+ int i;
+ int *thresh;
+ int threshmult;
+
+ int capped_q = (Qvalue < 160) ? Qvalue : 160;
+
+ vp8_clear_system_state(); //__asm emms;
+
+ cpi->RDMULT = (int)((0.00007 * (capped_q * capped_q * capped_q * capped_q)) - (0.0125 * (capped_q * capped_q * capped_q)) +
+ (2.25 * (capped_q * capped_q)) - (12.5 * capped_q) + 25.0);
+
+ if (cpi->RDMULT < 25)
+ cpi->RDMULT = 25;
+
+ if (cpi->pass == 2)
+ {
+ if (cpi->common.frame_type == KEY_FRAME)
+ cpi->RDMULT += (cpi->RDMULT * rd_iifactor[0]) / 16;
+ else if (cpi->next_iiratio > 31)
+ cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) / 16;
+ else
+ cpi->RDMULT += (cpi->RDMULT * rd_iifactor[cpi->next_iiratio]) / 16;
+ }
+
+
+ // Extend the rate multiplier alongside quantizer zbin increases
+ if (cpi->zbin_over_quant > 0)
+ {
+ double oq_factor = pow(1.006, cpi->zbin_over_quant);
+
+ if (oq_factor > (1.0 + ((double)cpi->zbin_over_quant / 64.0)))
+ oq_factor = (1.0 + (double)cpi->zbin_over_quant / 64.0);
+
+ cpi->RDMULT = (int)(cpi->RDMULT * oq_factor);
+ }
+
+ cpi->mb.errorperbit = (cpi->RDMULT / 100);
+
+ if (cpi->mb.errorperbit < 1)
+ cpi->mb.errorperbit = 1;
+
+ vp8_set_speed_features(cpi);
+
+ if (cpi->common.simpler_lpf)
+ cpi->common.filter_type = SIMPLE_LOOPFILTER;
+
+ q = (int)pow(Qvalue, 1.25);
+
+ if (q < 8)
+ q = 8;
+
+ if (cpi->ref_frame_flags == VP8_ALT_FLAG)
+ {
+ thresh = &cpi->rd_threshes[THR_NEWA];
+ threshmult = cpi->sf.thresh_mult[THR_NEWA];
+ }
+ else if (cpi->ref_frame_flags == VP8_GOLD_FLAG)
+ {
+ thresh = &cpi->rd_threshes[THR_NEWG];
+ threshmult = cpi->sf.thresh_mult[THR_NEWG];
+ }
+ else
+ {
+ thresh = &cpi->rd_threshes[THR_NEWMV];
+ threshmult = cpi->sf.thresh_mult[THR_NEWMV];
+ }
+
+ if (cpi->RDMULT > 1000)
+ {
+ cpi->RDDIV = 1;
+ cpi->RDMULT /= 100;
+
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ if (cpi->sf.thresh_mult[i] < INT_MAX)
+ {
+ cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
+ }
+ else
+ {
+ cpi->rd_threshes[i] = INT_MAX;
+ }
+
+ cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+ }
+ }
+ else
+ {
+ cpi->RDDIV = 100;
+
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ if (cpi->sf.thresh_mult[i] < (INT_MAX / q))
+ {
+ cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
+ }
+ else
+ {
+ cpi->rd_threshes[i] = INT_MAX;
+ }
+
+ cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+ }
+ }
+
+ fill_token_costs(
+ cpi->mb.token_costs,
+ (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs
+ );
+
+ vp8_init_mode_costs(cpi);
+
+}
+
+void vp8_auto_select_speed(VP8_COMP *cpi)
+{
+ int used = cpi->oxcf.cpu_used;
+
+ int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
+
+ milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
+
+#if 0
+
+ if (0)
+ {
+ FILE *f;
+
+ f = fopen("speed.stt", "a");
+ fprintf(f, " %8ld %10ld %10ld %10ld\n",
+ cpi->common.current_video_frame, cpi->Speed, milliseconds_for_compress, cpi->avg_pick_mode_time);
+ fclose(f);
+ }
+
+#endif
+
+ /*
+ // this is done during parameter valid check
+ if( used > 16)
+ used = 16;
+ if( used < -16)
+ used = -16;
+ */
+
+ if (cpi->avg_pick_mode_time < milliseconds_for_compress && (cpi->avg_encode_time - cpi->avg_pick_mode_time) < milliseconds_for_compress)
+ {
+ if (cpi->avg_pick_mode_time == 0)
+ {
+ cpi->Speed = 4;
+ }
+ else
+ {
+ if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95)
+ {
+ cpi->Speed += 2;
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+
+ if (cpi->Speed > 16)
+ {
+ cpi->Speed = 16;
+ }
+ }
+
+ if (milliseconds_for_compress * 100 > cpi->avg_encode_time * vp8_auto_speed_thresh[cpi->Speed])
+ {
+ cpi->Speed -= 1;
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+
+ // In real-time mode, cpi->speed is in [4, 16].
+ if (cpi->Speed < 4) //if ( cpi->Speed < 0 )
+ {
+ cpi->Speed = 4; //cpi->Speed = 0;
+ }
+ }
+ }
+ }
+ else
+ {
+ cpi->Speed += 4;
+
+ if (cpi->Speed > 16)
+ cpi->Speed = 16;
+
+
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+ }
+}
+
+int vp8_block_error_c(short *coeff, short *dqcoeff)
+{
+ int i;
+ int error = 0;
+
+ for (i = 0; i < 16; i++)
+ {
+ int this_diff = coeff[i] - dqcoeff[i];
+ error += this_diff * this_diff;
+ }
+
+ return error;
+}
+
+int vp8_mbblock_error_c(MACROBLOCK *mb, int dc)
+{
+ BLOCK *be;
+ BLOCKD *bd;
+ int i, j;
+ int berror, error = 0;
+
+ for (i = 0; i < 16; i++)
+ {
+ be = &mb->block[i];
+ bd = &mb->e_mbd.block[i];
+
+ berror = 0;
+
+ for (j = dc; j < 16; j++)
+ {
+ int this_diff = be->coeff[j] - bd->dqcoeff[j];
+ berror += this_diff * this_diff;
+ }
+
+ error += berror;
+ }
+
+ return error;
+}
+
+int vp8_mbuverror_c(MACROBLOCK *mb)
+{
+
+ BLOCK *be;
+ BLOCKD *bd;
+
+
+ int i;
+ int error = 0;
+
+ for (i = 16; i < 24; i++)
+ {
+ be = &mb->block[i];
+ bd = &mb->e_mbd.block[i];
+
+ error += vp8_block_error_c(be->coeff, bd->dqcoeff);
+ }
+
+ return error;
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+static int macro_block_max_error(MACROBLOCK *mb)
+{
+ int error = 0;
+ int dc = 0;
+ BLOCK *be;
+ int i, j;
+ int berror;
+
+ dc = !(mb->e_mbd.mbmi.mode == B_PRED || mb->e_mbd.mbmi.mode == SPLITMV);
+
+ for (i = 0; i < 16; i++)
+ {
+ be = &mb->block[i];
+
+ berror = 0;
+
+ for (j = dc; j < 16; j++)
+ {
+ int this_diff = be->coeff[j];
+ berror += this_diff * this_diff;
+ }
+
+ error += berror;
+ }
+
+ for (i = 16; i < 24; i++)
+ {
+ be = &mb->block[i];
+ berror = 0;
+
+ for (j = 0; j < 16; j++)
+ {
+ int this_diff = be->coeff[j];
+ berror += this_diff * this_diff;
+ }
+
+ error += berror;
+ }
+
+ error <<= 2;
+
+ if (dc)
+ {
+ be = &mb->block[24];
+ berror = 0;
+
+ for (j = 0; j < 16; j++)
+ {
+ int this_diff = be->coeff[j];
+ berror += this_diff * this_diff;
+ }
+
+ error += berror;
+ }
+
+ error >>= 4;
+ return error;
+}
+#endif
+
+int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd)
+{
+ unsigned char *uptr, *vptr;
+ unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
+ unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
+ int uv_stride = x->block[16].src_stride;
+
+ unsigned int sse1 = 0;
+ unsigned int sse2 = 0;
+ int mv_row;
+ int mv_col;
+ int offset;
+ int pre_stride = x->e_mbd.block[16].pre_stride;
+
+ vp8_build_uvmvs(&x->e_mbd, 0);
+ mv_row = x->e_mbd.block[16].bmi.mv.as_mv.row;
+ mv_col = x->e_mbd.block[16].bmi.mv.as_mv.col;
+
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->e_mbd.pre.u_buffer + offset;
+ vptr = x->e_mbd.pre.v_buffer + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ VARIANCE_INVOKE(rtcd, subpixvar8x8)(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
+ VARIANCE_INVOKE(rtcd, subpixvar8x8)(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
+ sse2 += sse1;
+ }
+ else
+ {
+ VARIANCE_INVOKE(rtcd, subpixvar8x8)(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
+ VARIANCE_INVOKE(rtcd, subpixvar8x8)(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
+ sse2 += sse1;
+ }
+
+ return sse2;
+
+}
+
+#if !(CONFIG_REALTIME_ONLY)
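+ // Rate estimate (in the encoder's internal cost units) for the quantized coefficients
+ // of one block: walk the tokens in zig-zag order and update the left/above entropy contexts.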
+static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+ int c = !type; /* start at coef 0, unless Y with Y2 */
+ int eob = b->eob;
+ int pt ; /* surrounding block/prev coef predictor */
+ int cost = 0;
+ short *qcoeff_ptr = b->qcoeff;
+
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+# define QC( I) ( qcoeff_ptr [vp8_default_zig_zag1d[I]] )
+
+ for (; c < eob; c++)
+ {
+ int v = QC(c);
+ int t = vp8_dct_value_tokens_ptr[v].Token;
+ cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t];
+ cost += vp8_dct_value_cost_ptr[v];
+ pt = vp8_prev_token_class[t];
+ }
+
+# undef QC
+
+ if (c < 16)
+ cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN];
+
+ pt = (c != !type); // is the eob the first coefficient?
+ *a = *l = pt;
+
+ return cost;
+}
+
+int vp8_rdcost_mby(MACROBLOCK *mb)
+{
+ int cost = 0;
+ int b;
+ TEMP_CONTEXT t, t2;
+ int type = 0;
+
+ MACROBLOCKD *x = &mb->e_mbd;
+
+ vp8_setup_temp_context(&t, x->above_context[Y1CONTEXT], x->left_context[Y1CONTEXT], 4);
+ vp8_setup_temp_context(&t2, x->above_context[Y2CONTEXT], x->left_context[Y2CONTEXT], 1);
+
+ if (x->mbmi.mode == SPLITMV)
+ type = 3;
+
+ for (b = 0; b < 16; b++)
+ cost += cost_coeffs(mb, x->block + b, type,
+ t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+
+ if (x->mbmi.mode != SPLITMV)
+ cost += cost_coeffs(mb, x->block + 24, 1,
+ t2.a + vp8_block2above[24], t2.l + vp8_block2left[24]);
+
+ return cost;
+}
+
+
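+ // Try every 4x4 intra prediction mode for a single block, costing the mode plus its
+ // tokens and measuring coefficient-domain distortion, and leave the block encoded
+ // with the winning mode.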
+static void rd_pick_intra4x4block(
+ VP8_COMP *cpi,
+ MACROBLOCK *x,
+ BLOCK *be,
+ BLOCKD *b,
+ B_PREDICTION_MODE *best_mode,
+ B_PREDICTION_MODE above,
+ B_PREDICTION_MODE left,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+
+ int *bestrate,
+ int *bestratey,
+ int *bestdistortion)
+{
+ B_PREDICTION_MODE mode;
+ int best_rd = INT_MAX; // 1<<30
+ int rate = 0;
+ int distortion;
+ unsigned int *mode_costs;
+
+ ENTROPY_CONTEXT ta = *a, tempa = *a;
+ ENTROPY_CONTEXT tl = *l, templ = *l;
+
+
+ if (x->e_mbd.frame_type == KEY_FRAME)
+ {
+ mode_costs = x->bmode_costs[above][left];
+ }
+ else
+ {
+ mode_costs = x->inter_bmode_costs;
+ }
+
+ for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++)
+ {
+ int this_rd;
+ int ratey;
+
+ rate = mode_costs[mode];
+ vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, mode);
+
+ tempa = ta;
+ templ = tl;
+
+ ratey = cost_coeffs(x, b, 3, &tempa, &templ);
+ rate += ratey;
+ distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(be->coeff, b->dqcoeff) >> 2;
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ *best_mode = mode;
+ *a = tempa;
+ *l = templ;
+ }
+ }
+
+ b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
+ vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, b->bmi.mode);
+
+}
+
+
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, int *Distortion)
+{
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ int i;
+ TEMP_CONTEXT t;
+ int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+ int distortion = 0;
+ int tot_rate_y = 0;
+
+ vp8_intra_prediction_down_copy(xd);
+ vp8_setup_temp_context(&t, xd->above_context[Y1CONTEXT], xd->left_context[Y1CONTEXT], 4);
+
+ for (i = 0; i < 16; i++)
+ {
+ MODE_INFO *const mic = xd->mode_info_context;
+ const int mis = xd->mode_info_stride;
+ const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode;
+ const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode;
+ B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+ int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+
+ rd_pick_intra4x4block(
+ cpi, mb, mb->block + i, xd->block + i, &best_mode, A, L,
+ t.a + vp8_block2above[i],
+ t.l + vp8_block2left[i], &r, &ry, &d);
+
+ cost += r;
+ distortion += d;
+ tot_rate_y += ry;
+ mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode;
+ }
+
+ *Rate = cost;
+ *rate_y += tot_rate_y;
+ *Distortion = distortion;
+
+ return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+}
+
+int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *Rate, int *rate_y, int *Distortion)
+{
+
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ int rate, ratey;
+ unsigned int distortion;
+ int best_rd = INT_MAX;
+
+ // Y: search for the best 16x16 intra prediction mode
+ for (mode = DC_PRED; mode <= TM_PRED; mode++)
+ {
+ int this_rd;
+ int dummy;
+ rate = 0;
+
+ x->e_mbd.mbmi.mode = mode;
+
+ rate += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.mode];
+
+ vp8_encode_intra16x16mbyrd(IF_RTCD(&cpi->rtcd), x);
+
+ ratey = vp8_rdcost_mby(x);
+
+ rate += ratey;
+
+ VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer, x->src.y_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride, &distortion, &dummy);
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ mode_selected = mode;
+ best_rd = this_rd;
+ *Rate = rate;
+ *rate_y = ratey;
+ *Distortion = (int)distortion;
+ }
+ }
+
+ x->e_mbd.mbmi.mode = mode_selected;
+ return best_rd;
+}
+
+
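+ // Token-cost rate for the 8 chroma (U then V) blocks of the current macroblock.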
+static int rd_cost_mbuv(MACROBLOCK *mb)
+{
+ TEMP_CONTEXT t, t2;
+ int b;
+ int cost = 0;
+ MACROBLOCKD *x = &mb->e_mbd;
+
+ vp8_setup_temp_context(&t, x->above_context[UCONTEXT], x->left_context[UCONTEXT], 2);
+ vp8_setup_temp_context(&t2, x->above_context[VCONTEXT], x->left_context[VCONTEXT], 2);
+
+ for (b = 16; b < 20; b++)
+ cost += cost_coeffs(mb, x->block + b, vp8_block2type[b],
+ t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+
+ for (b = 20; b < 24; b++)
+ cost += cost_coeffs(mb, x->block + b, vp8_block2type[b],
+ t2.a + vp8_block2above[b], t2.l + vp8_block2left[b]);
+
+ return cost;
+}
+
+
+unsigned int vp8_get_mbuvrecon_error(const vp8_variance_rtcd_vtable_t *rtcd, const MACROBLOCK *x) // sum of squares
+{
+ unsigned int sse0, sse1;
+ int sum0, sum1;
+ VARIANCE_INVOKE(rtcd, get8x8var)(x->src.u_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer, x->e_mbd.dst.uv_stride, &sse0, &sum0);
+ VARIANCE_INVOKE(rtcd, get8x8var)(x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride, &sse1, &sum1);
+ return (sse0 + sse1);
+}
+
+static int vp8_rd_inter_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int fullpixel)
+{
+ vp8_build_uvmvs(&x->e_mbd, fullpixel);
+ vp8_encode_inter16x16uvrd(IF_RTCD(&cpi->rtcd), x);
+
+
+ *rate = rd_cost_mbuv(x);
+ *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
+
+ return UVRDFUNC(x->rdmult, x->rddiv, *rate, *distortion, cpi->target_bits_per_mb);
+}
+
+int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion)
+{
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ int best_rd = INT_MAX;
+ int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+ int rate_to;
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode++)
+ {
+ int rate;
+ int distortion;
+ int this_rd;
+
+ x->e_mbd.mbmi.uv_mode = mode;
+ vp8_encode_intra16x16mbuvrd(IF_RTCD(&cpi->rtcd), x);
+
+ rate_to = rd_cost_mbuv(x);
+ rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.uv_mode];
+
+ distortion = vp8_get_mbuvrecon_error(IF_RTCD(&cpi->rtcd.variance), x);
+
+ this_rd = UVRDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb);
+
+ if (this_rd < best_rd)
+ {
+ best_rd = this_rd;
+ d = distortion;
+ r = rate;
+ *rate_tokenonly = rate_to;
+ mode_selected = mode;
+ }
+ }
+
+ *rate = r;
+ *distortion = d;
+
+ x->e_mbd.mbmi.uv_mode = mode_selected;
+ return best_rd;
+}
+#endif
+
+int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4])
+{
+ vp8_prob p [VP8_MVREFS-1];
+ assert(NEARESTMV <= m && m <= SPLITMV);
+ vp8_mv_ref_probs(p, near_mv_ref_ct);
+ return vp8_cost_token(vp8_mv_ref_tree, p, VP8_MVREFENCODINGS + m);
+}
+
+void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv)
+{
+ int i;
+
+ x->e_mbd.mbmi.mode = mb;
+ x->e_mbd.mbmi.mv.as_mv.row = mv->row;
+ x->e_mbd.mbmi.mv.as_mv.col = mv->col;
+
+ for (i = 0; i < 16; i++)
+ {
+ B_MODE_INFO *bmi = &x->e_mbd.block[i].bmi;
+ bmi->mode = (B_PREDICTION_MODE) mb;
+ bmi->mv.as_mv.row = mv->row;
+ bmi->mv.as_mv.col = mv->col;
+ }
+}
+
+#if !(CONFIG_REALTIME_ONLY)
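+ // Number of distinct partitions in a 16-entry split labeling
+ // (labels run 0..n-1, so the count is the maximum label plus one).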
+int vp8_count_labels(int const *labelings)
+{
+ int i;
+ int count = 0;
+
+ for (i = 0; i < 16; i++)
+ {
+ if (labelings[i] > count)
+ count = labelings[i];
+ }
+
+ return count + 1;
+}
+
+
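+ // Assign the chosen mode and motion vector to every 4x4 block carrying the given label
+ // and return the cost of signalling that choice (plus the motion vector cost when a
+ // genuinely new vector is coded).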
+static int labels2mode(
+ MACROBLOCK *x,
+ int const *labelings, int which_label,
+ B_PREDICTION_MODE this_mode,
+ MV *this_mv, MV *best_ref_mv,
+ int *mvcost[2]
+)
+{
+ MACROBLOCKD *const xd = & x->e_mbd;
+ MODE_INFO *const mic = xd->mode_info_context;
+ const int mis = xd->mode_info_stride;
+
+ int cost = 0;
+ int thismvcost = 0;
+
+ /* We have to be careful retrieving previously-encoded motion vectors.
+ Ones from this macroblock have to be pulled from the BLOCKD array
+ as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+
+ int i = 0;
+
+ do
+ {
+ BLOCKD *const d = xd->block + i;
+ const int row = i >> 2, col = i & 3;
+
+ B_PREDICTION_MODE m;
+
+ if (labelings[i] != which_label)
+ continue;
+
+ if (col && labelings[i] == labelings[i-1])
+ m = LEFT4X4;
+ else if (row && labelings[i] == labelings[i-4])
+ m = ABOVE4X4;
+ else
+ {
+ // the only time we should do costing for a new motion vector or mode
+ // is when we are on a new label (jbb May 08, 2007)
+ switch (m = this_mode)
+ {
+ case NEW4X4 :
+ thismvcost = vp8_mv_bit_cost(this_mv, best_ref_mv, mvcost, 102);
+ break;
+ case LEFT4X4:
+ *this_mv = col ? d[-1].bmi.mv.as_mv : vp8_left_bmi(mic, i)->mv.as_mv;
+ break;
+ case ABOVE4X4:
+ *this_mv = row ? d[-4].bmi.mv.as_mv : vp8_above_bmi(mic, i, mis)->mv.as_mv;
+ break;
+ case ZERO4X4:
+ this_mv->row = this_mv->col = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (m == ABOVE4X4) // replace above with left if same
+ {
+ const MV mv = col ? d[-1].bmi.mv.as_mv : vp8_left_bmi(mic, i)->mv.as_mv;
+
+ if (mv.row == this_mv->row && mv.col == this_mv->col)
+ m = LEFT4X4;
+ }
+
+ cost = x->inter_bmode_costs[ m];
+ }
+
+ d->bmi.mode = m;
+ d->bmi.mv.as_mv = *this_mv;
+
+ }
+ while (++i < 16);
+
+ cost += thismvcost ;
+ return cost;
+}
+
+static int rdcost_mbsegment_y(MACROBLOCK *mb, const int *labels, int which_label, TEMP_CONTEXT *t)
+{
+ int cost = 0;
+ int b;
+ MACROBLOCKD *x = &mb->e_mbd;
+
+
+ for (b = 0; b < 16; b++)
+ if (labels[ b] == which_label)
+ cost += cost_coeffs(mb, x->block + b, 3,
+ t->a + vp8_block2above[b],
+ t->l + vp8_block2left[b]);
+
+ return cost;
+
+}
+static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels, int which_label, const vp8_encodemb_rtcd_vtable_t *rtcd)
+{
+ int i;
+ unsigned int distortion = 0;
+
+ for (i = 0; i < 16; i++)
+ {
+ if (labels[i] == which_label)
+ {
+ BLOCKD *bd = &x->e_mbd.block[i];
+ BLOCK *be = &x->block[i];
+
+
+ vp8_build_inter_predictors_b(bd, 16, x->e_mbd.subpixel_predict);
+ ENCODEMB_INVOKE(rtcd, subb)(be, bd, 16);
+ x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
+
+ // set to 0: there is no way to account for the 2nd order DC here, so discount it
+ //be->coeff[0] = 0;
+ x->quantize_brd(be, bd);
+
+ distortion += ENCODEMB_INVOKE(rtcd, berr)(be->coeff, bd->dqcoeff);
+ }
+ }
+
+ return distortion;
+}
+
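+ // Luma rate/distortion for the current prediction: subtract the predictor, forward
+ // transform the 16 sub-blocks (plus the 2nd order Y2 block unless SPLITMV), quantize,
+ // then return the token-cost rate and coefficient-domain distortion.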
+static void macro_block_yrd(MACROBLOCK *mb, int *Rate, int *Distortion, const vp8_encodemb_rtcd_vtable_t *rtcd)
+{
+ int b;
+ MACROBLOCKD *const x = &mb->e_mbd;
+ BLOCK *const mb_y2 = mb->block + 24;
+ BLOCKD *const x_y2 = x->block + 24;
+ short *Y2DCPtr = mb_y2->src_diff;
+ BLOCK *beptr;
+ int d;
+
+ ENCODEMB_INVOKE(rtcd, submby)(mb->src_diff, mb->src.y_buffer, mb->e_mbd.predictor, mb->src.y_stride);
+
+ // Fdct and building the 2nd order block
+ for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
+ {
+ mb->short_fdct8x4rd(beptr->src_diff, beptr->coeff, 32);
+ *Y2DCPtr++ = beptr->coeff[0];
+ *Y2DCPtr++ = beptr->coeff[16];
+ }
+
+ // 2nd order fdct
+ if (x->mbmi.mode != SPLITMV)
+ {
+ mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
+ }
+
+ // Quantization
+ for (b = 0; b < 16; b++)
+ {
+ mb->quantize_brd(&mb->block[b], &mb->e_mbd.block[b]);
+ }
+
+ // DC prediction and quantization of the 2nd order block
+ if (x->mbmi.mode != SPLITMV)
+ {
+ mb->quantize_brd(mb_y2, x_y2);
+ }
+
+ // Distortion
+ if (x->mbmi.mode == SPLITMV)
+ d = ENCODEMB_INVOKE(rtcd, mberr)(mb, 0) << 2;
+ else
+ {
+ d = ENCODEMB_INVOKE(rtcd, mberr)(mb, 1) << 2;
+ d += ENCODEMB_INVOKE(rtcd, berr)(mb_y2->coeff, x_y2->dqcoeff);
+ }
+
+ *Distortion = (d >> 4);
+
+ // rate
+ *Rate = vp8_rdcost_mby(mb);
+}
+
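+ // Evaluate the four macroblock split configurations (16x8, 8x16, 8x8 and 4x4),
+ // running a per-segment motion search and RD costing for each, and keep the best segmentation.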
+static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *mdcounts, int *returntotrate, int *returnyrate, int *returndistortion, int compressor_speed, int *mvcost[2], int mvthresh, int fullpixel)
+{
+ int i, segmentation;
+ B_PREDICTION_MODE this_mode;
+ MACROBLOCKD *xc = &x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ BLOCK *c = &x->block[0];
+ BLOCKD *e = &x->e_mbd.block[0];
+ int const *labels;
+ int best_segment_rd = INT_MAX;
+ int best_seg = 0;
+ int br = 0;
+ int bd = 0;
+ int bsr = 0;
+ int bsd = 0;
+ int bestsegmentyrate = 0;
+
+ // FIX TO Rd error outrange bug PGW 9 june 2004
+ B_PREDICTION_MODE bmodes[16] = {ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4,
+ ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4,
+ ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4,
+ ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4
+ };
+
+ MV bmvs[16];
+ int beobs[16];
+
+ for (segmentation = 0; segmentation < VP8_NUMMBSPLITS; segmentation++)
+ {
+ int label_count;
+ int this_segment_rd = 0;
+ int label_mv_thresh;
+ int rate = 0;
+ int sbr = 0;
+ int sbd = 0;
+ int UNINITIALIZED_IS_SAFE(sseshift);
+ int segmentyrate = 0;
+
+ vp8_variance_fn_ptr_t v_fn_ptr;
+
+ TEMP_CONTEXT t;
+ TEMP_CONTEXT tb;
+ vp8_setup_temp_context(&t, xc->above_context[Y1CONTEXT], xc->left_context[Y1CONTEXT], 4);
+
+ br = 0;
+ bd = 0;
+
+ switch (segmentation)
+ {
+ case 0:
+ v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x8);
+ v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x8);
+ v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
+ v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
+ v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
+ sseshift = 3;
+ break;
+ case 1:
+ v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x16);
+ v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x16);
+ v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
+ v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
+ v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
+ sseshift = 3;
+ break;
+ case 2:
+ v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x8);
+ v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x8);
+ v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
+ v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
+ v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
+ sseshift = 2;
+ break;
+ case 3:
+ v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var4x4);
+ v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar4x4);
+ v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
+ v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
+ v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
+ sseshift = 0;
+ break;
+ }
+
+ labels = vp8_mbsplits[segmentation];
+ label_count = vp8_count_labels(labels);
+
+ // A multiplier of 64 makes this threshold so large that we would very
+ // rarely check mvs on segments; a multiplier of 1 makes the mv threshold
+ // roughly equal to what it is for whole macroblocks.
+ label_mv_thresh = 1 * mvthresh / label_count;
+
+ // Segmentation method overheads
+ rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation);
+
+ rate += vp8_cost_mv_ref(SPLITMV, mdcounts);
+
+ this_segment_rd += RDFUNC(x->rdmult, x->rddiv, rate, 0, cpi->target_bits_per_mb);
+ br += rate;
+
+ for (i = 0; i < label_count; i++)
+ {
+ MV mode_mv[B_MODE_COUNT];
+ int best_label_rd = INT_MAX;
+ B_PREDICTION_MODE mode_selected = ZERO4X4;
+ int j;
+ int bestlabelyrate = 0;
+
+ b = &x->block[0];
+ d = &x->e_mbd.block[0];
+
+
+ // find first label
+ for (j = 0; j < 16; j++)
+ if (labels[j] == i)
+ break;
+
+ c = &x->block[j];
+ e = &x->e_mbd.block[j];
+
+ // search for the best motion vector on this segment
+ for (this_mode = LEFT4X4; this_mode <= NEW4X4 ; this_mode ++)
+ {
+ int distortion;
+ int this_rd;
+ int num00;
+ int labelyrate;
+
+ TEMP_CONTEXT ts;
+ vp8_setup_temp_context(&ts, &t.a[0], &t.l[0], 4);
+
+ if (this_mode == NEW4X4)
+ {
+ int step_param = 0;
+ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+ int n;
+ int thissme;
+ int bestsme = INT_MAX;
+ MV temp_mv;
+
+ // Is the best so far sufficiently good that we can't justify doing a new motion search?
+ if (best_label_rd < label_mv_thresh)
+ break;
+
+ {
+ int sadpb = x->sadperbit4;
+
+ if (cpi->sf.search_method == HEX)
+ bestsme = vp8_hex_search(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb/*x->errorperbit*/, &num00, v_fn_ptr.vf, v_fn_ptr.sdf, x->mvsadcost, mvcost);
+ else
+ {
+ bestsme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb / 2/*x->errorperbit*/, &num00, &v_fn_ptr, x->mvsadcost, mvcost);
+
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &temp_mv, step_param + n, sadpb / 2/*x->errorperbit*/, &num00, &v_fn_ptr, x->mvsadcost, mvcost);
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEW4X4].row = temp_mv.row;
+ mode_mv[NEW4X4].col = temp_mv.col;
+ }
+ }
+ }
+ }
+
+ // Should we do a full search (best quality only)
+ if ((compressor_speed == 0) && (bestsme >> sseshift) > 4000)
+ {
+ thissme = cpi->full_search_sad(x, c, e, best_ref_mv, sadpb / 4, 16, &v_fn_ptr, x->mvcost, x->mvsadcost);
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEW4X4] = e->bmi.mv.as_mv;
+ }
+ else
+ {
+ // The full search result is actually worse so re-instate the previous best vector
+ e->bmi.mv.as_mv = mode_mv[NEW4X4];
+ }
+ }
+ }
+
+ if (bestsme < INT_MAX)
+ {
+ if (!fullpixel)
+ cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit / 2, v_fn_ptr.svf, v_fn_ptr.vf, mvcost);
+ else
+ vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit, v_fn_ptr.svf, v_fn_ptr.vf, mvcost);
+ }
+ }
+
+ rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode], best_ref_mv, mvcost);
+
+ // Trap vectors that reach beyond the UMV borders
+ if (((mode_mv[this_mode].row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].col >> 3) > x->mv_col_max))
+ {
+ continue;
+ }
+
+ distortion = vp8_encode_inter_mb_segment(x, labels, i, IF_RTCD(&cpi->rtcd.encodemb)) / 4;
+
+ labelyrate = rdcost_mbsegment_y(x, labels, i, &ts);
+ rate += labelyrate;
+
+ this_rd = RDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb);
+
+ if (this_rd < best_label_rd)
+ {
+ sbr = rate;
+ sbd = distortion;
+ bestlabelyrate = labelyrate;
+ mode_selected = this_mode;
+ best_label_rd = this_rd;
+ vp8_setup_temp_context(&tb, &ts.a[0], &ts.l[0], 4);
+
+ }
+ }
+
+ vp8_setup_temp_context(&t, &tb.a[0], &tb.l[0], 4);
+
+ labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected], best_ref_mv, mvcost);
+
+ br += sbr;
+ bd += sbd;
+ segmentyrate += bestlabelyrate;
+ this_segment_rd += best_label_rd;
+
+ if ((this_segment_rd > best_rd) || (this_segment_rd > best_segment_rd))
+ break;
+ }
+
+ if ((this_segment_rd <= best_rd) && (this_segment_rd < best_segment_rd))
+ {
+ bsr = br;
+ bsd = bd;
+ bestsegmentyrate = segmentyrate;
+ best_segment_rd = this_segment_rd;
+ best_seg = segmentation;
+
+ // store everything needed to come back to this!!
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *bd = &x->e_mbd.block[i];
+
+ bmvs[i] = bd->bmi.mv.as_mv;
+ bmodes[i] = bd->bmi.mode;
+ beobs[i] = bd->eob;
+ }
+ }
+ }
+
+ // set it to the best
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *bd = &x->e_mbd.block[i];
+
+ bd->bmi.mv.as_mv = bmvs[i];
+ bd->bmi.mode = bmodes[i];
+ bd->eob = beobs[i];
+ }
+
+ // Trap cases where the best split mode has all vectors coded 0,0 (or all the same)
+ if (FALSE)
+ {
+ int allsame = 1;
+
+ for (i = 1; i < 16; i++)
+ {
+ if ((bmvs[i].col != bmvs[i-1].col) || (bmvs[i].row != bmvs[i-1].row))
+ {
+ allsame = 0;
+ break;
+ }
+ }
+
+ if (allsame)
+ {
+ best_segment_rd = INT_MAX;
+ }
+ }
+
+ *returntotrate = bsr;
+ *returndistortion = bsd;
+ *returnyrate = bestsegmentyrate;
+
+
+
+ // save partitions
+ labels = vp8_mbsplits[best_seg];
+ x->e_mbd.mbmi.partitioning = best_seg;
+ x->e_mbd.mbmi.partition_count = vp8_count_labels(labels);
+
+ for (i = 0; i < x->e_mbd.mbmi.partition_count; i++)
+ {
+ int j;
+
+ for (j = 0; j < 16; j++)
+ {
+ if (labels[j] == i)
+ break;
+ }
+
+ x->e_mbd.mbmi.partition_bmi[i].mode = x->e_mbd.block[j].bmi.mode;
+ x->e_mbd.mbmi.partition_bmi[i].mv.as_mv = x->e_mbd.block[j].bmi.mv.as_mv;
+ }
+
+ return best_segment_rd;
+}
+
+
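+ // Full rate-distortion mode decision for one macroblock: loop over the candidate
+ // inter/intra modes in vp8_mode_order, adaptively skipping modes via the
+ // rd_threshes / mode_check_freq heuristics, and return the RD cost of the best choice.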
+int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
+{
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+ B_MODE_INFO best_bmodes[16];
+ MB_MODE_INFO best_mbmode;
+ MV best_ref_mv;
+ MV mode_mv[MB_MODE_COUNT];
+ MB_PREDICTION_MODE this_mode;
+ int num00;
+ int best_mode_index = 0;
+
+ int i;
+ int mode_index;
+ int mdcounts[4];
+ int rate;
+ int distortion;
+ int best_rd = INT_MAX; // 1 << 30;
+ int ref_frame_cost[MAX_REF_FRAMES];
+ int rate2, distortion2;
+ int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
+ int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
+
+ //int all_rds[MAX_MODES]; // Experimental debug code.
+ //int all_rates[MAX_MODES];
+ //int all_dist[MAX_MODES];
+ //int intermodecost[MAX_MODES];
+
+ MB_PREDICTION_MODE uv_intra_mode;
+ int sse;
+ int sum;
+ int uvintra_eob = 0;
+ int tteob = 0;
+ int force_no_skip = 0;
+
+ *returnintra = INT_MAX;
+
+ cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame
+
+ x->skip = 0;
+
+ ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded);
+
+ // Experimental code
+ // Adjust the RD multiplier based on the best case distortion we saw in the most recently coded mb
+ //if ( (cpi->last_mb_distortion) > 0 && (cpi->target_bits_per_mb > 0) )
+ /*{
+ int tmprdmult;
+
+ //tmprdmult = (cpi->last_mb_distortion * 256) / ((cpi->av_per_frame_bandwidth*256)/cpi->common.MBs);
+ tmprdmult = (cpi->last_mb_distortion * 256) / cpi->target_bits_per_mb;
+ //tmprdmult = tmprdmult;
+
+ //if ( tmprdmult > cpi->RDMULT * 2 )
+ // tmprdmult = cpi->RDMULT * 2;
+ //else if ( tmprdmult < cpi->RDMULT / 2 )
+ // tmprdmult = cpi->RDMULT / 2;
+
+ //tmprdmult = (tmprdmult < 25) ? 25 : tmprdmult;
+
+ //x->rdmult = tmprdmult;
+
+ }*/
+
+ // Special case treatment when GF and ARF are not sensible options for reference
+ if (cpi->ref_frame_flags == VP8_LAST_FLAG)
+ {
+ ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_zero(255);
+ ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(255)
+ + vp8_cost_zero(128);
+ ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(255)
+ + vp8_cost_one(128);
+ }
+ else
+ {
+ ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_zero(cpi->prob_last_coded);
+ ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(cpi->prob_last_coded)
+ + vp8_cost_zero(cpi->prob_gf_coded);
+ ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(cpi->prob_last_coded)
+ + vp8_cost_one(cpi->prob_gf_coded);
+ }
+
+ vpx_memset(mode_mv, 0, sizeof(mode_mv));
+
+ x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+ vp8_rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion);
+ uv_intra_mode = x->e_mbd.mbmi.uv_mode;
+ {
+ uvintra_eob = 0;
+
+ for (i = 16; i < 24; i++)
+ uvintra_eob += x->e_mbd.block[i].eob;
+ }
+
+ for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
+ {
+ int frame_cost;
+ int this_rd = INT_MAX;
+ int lf_or_gf = 0; // Last frame (0) or gf/arf (1)
+ int disable_skip = 0;
+
+ force_no_skip = 0;
+
+ // Experimental debug code.
+ // Record of rd values recorded for this MB. -1 indicates not measured
+ //all_rds[mode_index] = -1;
+ //all_rates[mode_index] = -1;
+ //all_dist[mode_index] = -1;
+ //intermodecost[mode_index] = -1;
+
+ // Test best rd so far against threshold for trying this mode.
+ if (best_rd <= cpi->rd_threshes[mode_index])
+ continue;
+
+
+
+ // These variables hold our rolling total cost and distortion for this mode
+ rate2 = 0;
+ distortion2 = 0;
+
+ // Where skip is allowable, add in the default per-mb cost for the no-skip case.
+ // Where we then decide to skip, we have to delete this and replace it with the
+ // cost of signalling a skip.
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ rate2 += vp8_cost_bit(cpi->prob_skip_false, 0);
+ }
+
+ this_mode = vp8_mode_order[mode_index];
+
+ x->e_mbd.mbmi.mode = this_mode;
+ x->e_mbd.mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mbmi.ref_frame = vp8_ref_frame_order[mode_index];
+
+ //Only consider ZEROMV/ALTREF_FRAME for alt ref frame.
+ if (cpi->is_src_frame_alt_ref)
+ {
+ if (this_mode != ZEROMV || x->e_mbd.mbmi.ref_frame != ALTREF_FRAME)
+ continue;
+ }
+
+ if (x->e_mbd.mbmi.ref_frame == LAST_FRAME)
+ {
+ if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
+ continue;
+
+ lf_or_gf = 0; // Local last frame vs Golden frame flag
+
+ // Set up pointers for this macro block into the previous frame recon buffer
+ x->e_mbd.pre.y_buffer = cpi->common.last_frame.y_buffer + recon_yoffset;
+ x->e_mbd.pre.u_buffer = cpi->common.last_frame.u_buffer + recon_uvoffset;
+ x->e_mbd.pre.v_buffer = cpi->common.last_frame.v_buffer + recon_uvoffset;
+ }
+ else if (x->e_mbd.mbmi.ref_frame == GOLDEN_FRAME)
+ {
+
+ // not supposed to reference gold frame
+ if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
+ continue;
+
+ lf_or_gf = 1; // Local last frame vs Golden frame flag
+
+ // Set up pointers for this macro block into the previous frame recon buffer
+ x->e_mbd.pre.y_buffer = cpi->common.golden_frame.y_buffer + recon_yoffset;
+ x->e_mbd.pre.u_buffer = cpi->common.golden_frame.u_buffer + recon_uvoffset;
+ x->e_mbd.pre.v_buffer = cpi->common.golden_frame.v_buffer + recon_uvoffset;
+ }
+ else if (x->e_mbd.mbmi.ref_frame == ALTREF_FRAME)
+ {
+ // not supposed to reference alt ref frame
+ if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
+ continue;
+
+ //if ( !cpi->source_alt_ref_active )
+ // continue;
+
+ lf_or_gf = 1; // Local last frame vs Golden frame flag
+
+ // Set up pointers for this macro block into the previous frame recon buffer
+ x->e_mbd.pre.y_buffer = cpi->common.alt_ref_frame.y_buffer + recon_yoffset;
+ x->e_mbd.pre.u_buffer = cpi->common.alt_ref_frame.u_buffer + recon_uvoffset;
+ x->e_mbd.pre.v_buffer = cpi->common.alt_ref_frame.v_buffer + recon_uvoffset;
+ }
+
+ vp8_find_near_mvs(&x->e_mbd,
+ x->e_mbd.mode_info_context,
+ &mode_mv[NEARESTMV], &mode_mv[NEARMV], &best_ref_mv,
+ mdcounts, x->e_mbd.mbmi.ref_frame, cpi->common.ref_frame_sign_bias);
+
+
+ // Estimate the reference frame signaling cost and add it to the rolling cost variable.
+ frame_cost = ref_frame_cost[x->e_mbd.mbmi.ref_frame];
+ rate2 += frame_cost;
+
+ if (this_mode <= B_PRED)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO));
+ }
+ }
+
+ // Check to see if the testing frequency for this mode is at its max
+ // If so then prevent it from being tested and increase the threshold for its testing
+ if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
+ {
+ if (cpi->mbs_tested_so_far <= cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])
+ {
+ // Increase the threshold for coding this mode to make it less likely to be chosen
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+
+ continue;
+ }
+ }
+
+ // We have now reached the point where we are going to test the current mode, so increment the counter for the number of times it has been tested
+ cpi->mode_test_hit_counts[mode_index] ++;
+
+ // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to suppress noise
+ if (cpi->zbin_mode_boost_enabled)
+ {
+ if ((vp8_mode_order[mode_index] == ZEROMV) && (vp8_ref_frame_order[mode_index] != LAST_FRAME))
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = 0;
+
+ vp8cx_mb_init_quantizer(cpi, x);
+ }
+
+ switch (this_mode)
+ {
+ case B_PRED:
+
+ // Note the rate value returned here includes the cost of coding the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED];
+ vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion);
+ rate2 += rate;
+ //rate_y = rate;
+ distortion2 += distortion;
+ rate2 += uv_intra_rate;
+ rate_uv = uv_intra_rate_tokenonly;
+ distortion2 += uv_intra_distortion;
+ break;
+
+ case SPLITMV:
+ {
+ int frame_cost_rd = RDFUNC(x->rdmult, x->rddiv, frame_cost, 0, cpi->target_bits_per_mb);
+ int saved_rate = rate2;
+
+ // vp8_rd_pick_best_mbsegmentation looks only at Y and does not account for frame_cost.
+ // (best_rd - frame_cost_rd) is thus a conservative breakout number.
+ int breakout_rd = best_rd - frame_cost_rd;
+ int tmp_rd;
+
+ if (x->e_mbd.mbmi.ref_frame == LAST_FRAME)
+ tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, breakout_rd, mdcounts, &rate, &rate_y, &distortion, cpi->compressor_speed, x->mvcost, cpi->rd_threshes[THR_NEWMV], cpi->common.full_pixel) ;
+ else if (x->e_mbd.mbmi.ref_frame == GOLDEN_FRAME)
+ tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, breakout_rd, mdcounts, &rate, &rate_y, &distortion, cpi->compressor_speed, x->mvcost, cpi->rd_threshes[THR_NEWG], cpi->common.full_pixel) ;
+ else
+ tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, breakout_rd, mdcounts, &rate, &rate_y, &distortion, cpi->compressor_speed, x->mvcost, cpi->rd_threshes[THR_NEWA], cpi->common.full_pixel) ;
+
+ rate2 += rate;
+ distortion2 += distortion;
+
+ // If even the 'Y' rd value of split is higher than the best so far then don't bother looking at UV
+ if (tmp_rd < breakout_rd)
+ {
+ // Now work out UV cost and add it in
+ vp8_rd_inter_uv(cpi, x, &rate, &distortion, cpi->common.full_pixel);
+ rate2 += rate;
+ rate_uv = rate;
+ distortion2 += distortion;
+
+ }
+ else
+ {
+ this_rd = INT_MAX;
+ disable_skip = 1;
+ }
+
+ // Trap cases where the best split mode has all vectors coded 0,0 (or all the same)
+ if (0)
+ {
+ int allsame = 1;
+
+ for (i = 1; i < 16; i++)
+ {
+ BLOCKD *bd = &x->e_mbd.block[i];
+
+ if (bd->bmi.mv.as_int != x->e_mbd.block[0].bmi.mv.as_int) //(bmvs[i].col != bmvs[i-1].col) || (bmvs[i].row != bmvs[i-1].row ) )
+ {
+ allsame = 0;
+ break;
+ }
+ }
+
+ if (allsame)
+ {
+ // reset mode and mv and jump to newmv
+ this_mode = NEWMV;
+ distortion2 = 0;
+ rate2 = saved_rate;
+ mode_mv[NEWMV].row = x->e_mbd.block[0].bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = x->e_mbd.block[0].bmi.mv.as_mv.col;
+ rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
+ goto mv_selected;
+ }
+ }
+
+ // trap cases where the 8x8s can be promoted to 8x16s or 16x8s
+ if (0)//x->e_mbd.mbmi.partition_count == 4)
+ {
+
+ if (x->e_mbd.mbmi.partition_bmi[0].mv.as_int == x->e_mbd.mbmi.partition_bmi[1].mv.as_int
+ && x->e_mbd.mbmi.partition_bmi[2].mv.as_int == x->e_mbd.mbmi.partition_bmi[3].mv.as_int)
+ {
+ const int *labels = vp8_mbsplits[2];
+ x->e_mbd.mbmi.partitioning = 0;
+ rate -= vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + 2);
+ rate += vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings);
+ //rate -= x->inter_bmode_costs[ x->e_mbd.mbmi.partition_bmi[1]];
+ //rate -= x->inter_bmode_costs[ x->e_mbd.mbmi.partition_bmi[3]];
+ x->e_mbd.mbmi.partition_bmi[1] = x->e_mbd.mbmi.partition_bmi[2];
+ }
+ }
+
+ }
+ break;
+ case DC_PRED:
+ case V_PRED:
+ case H_PRED:
+ case TM_PRED:
+ x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+ vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ {
+ macro_block_yrd(x, &rate, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ;
+ rate2 += rate;
+ rate_y = rate;
+ distortion2 += distortion;
+ rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.mode];
+ rate2 += uv_intra_rate;
+ rate_uv = uv_intra_rate_tokenonly;
+ distortion2 += uv_intra_distortion;
+ }
+ break;
+
+ case NEWMV:
+
+ // Decrement full search counter
+ if (cpi->check_freq[lf_or_gf] > 0)
+ cpi->check_freq[lf_or_gf] --;
+
+ {
+ int thissme;
+ int bestsme = INT_MAX;
+ int step_param = cpi->sf.first_step;
+ int search_range;
+ int further_steps;
+ int n;
+
+ // Work out how long a search we should do
+ search_range = MAXF(abs(best_ref_mv.col), abs(best_ref_mv.row)) >> 3;
+
+ if (search_range >= x->vector_range)
+ x->vector_range = search_range;
+ else if (x->vector_range > cpi->sf.min_fs_radius)
+ x->vector_range--;
+
+ // Initial step/diamond search
+ {
+ int sadpb = x->sadperbit16;
+
+ if (cpi->sf.search_method == HEX)
+ {
+ bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf, x->mvsadcost, x->mvcost);
+ mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+ }
+ else
+ {
+ bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb < 9
+ mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+ // Further step/diamond searches as necessary
+ n = 0;
+ further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb = 9
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+ }
+ else
+ {
+ d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
+ d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
+ }
+ }
+ }
+ }
+
+ }
+
+ // Should we do a full search
+ if (!cpi->check_freq[lf_or_gf] || cpi->do_full[lf_or_gf])
+ {
+ int thissme;
+ int full_flag_thresh = 0;
+
+ // Update x->vector_range based on best vector found in step search
+ search_range = MAXF(abs(d->bmi.mv.as_mv.row), abs(d->bmi.mv.as_mv.col));
+
+ if (search_range > x->vector_range)
+ x->vector_range = search_range;
+ else
+ search_range = x->vector_range;
+
+ // Apply limits
+ search_range = (search_range > cpi->sf.max_fs_radius) ? cpi->sf.max_fs_radius : search_range;
+ {
+ int sadpb = x->sadperbit16 >> 2;
+ thissme = cpi->full_search_sad(x, b, d, &best_ref_mv, sadpb, search_range, &cpi->fn_ptr, x->mvcost, x->mvsadcost);
+ }
+
+ // Barrier threshold for initiating a full search
+ // full_flag_thresh = 10 + (thissme >> 7);
+ if ((thissme + full_flag_thresh) < bestsme)
+ {
+ cpi->do_full[lf_or_gf] ++;
+ bestsme = thissme;
+ }
+ else if (thissme < bestsme)
+ bestsme = thissme;
+ else
+ {
+ cpi->do_full[lf_or_gf] = cpi->do_full[lf_or_gf] >> 1;
+ cpi->check_freq[lf_or_gf] = cpi->sf.full_freq[lf_or_gf];
+
+ // The full search result is actually worse so re-instate the previous best vector
+ d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
+ d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
+ }
+ }
+
+ if (bestsme < INT_MAX)
+ // cpi->find_fractional_mv_step(x,b,d,&d->bmi.mv.as_mv,&best_ref_mv,x->errorperbit/2,cpi->fn_ptr.svf,cpi->fn_ptr.vf,x->mvcost); // normal mvc=11
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, cpi->fn_ptr.svf, cpi->fn_ptr.vf, x->mvcost);
+
+ mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+ mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+ // Add the new motion vector cost to our rolling cost variable
+ rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
+
+ }
+
+ case NEARESTMV:
+ case NEARMV:
+
+ // Clip "next_nearest" so that it does not extend to far out of image
+ if (mode_mv[this_mode].col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+ mode_mv[this_mode].col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+ else if (mode_mv[this_mode].col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+ mode_mv[this_mode].col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+ if (mode_mv[this_mode].row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+ mode_mv[this_mode].row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+ else if (mode_mv[this_mode].row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+ mode_mv[this_mode].row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+
+ // Do not bother proceeding if the vector (from newmv, nearest or near) is 0,0 as this should then be coded using the zeromv mode.
+ if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) &&
+ ((mode_mv[this_mode].row == 0) && (mode_mv[this_mode].col == 0)))
+ continue;
+
+ case ZEROMV:
+
+ mv_selected:
+
+ // Trap vectors that reach beyond the UMV borders
+ // Note that ALL New MV, Nearest MV, Near MV and Zero MV code drops through to this point
+ // because of the lack of break statements in the previous two cases.
+ if (((mode_mv[this_mode].row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].col >> 3) > x->mv_col_max))
+ continue;
+
+ vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
+ vp8_build_inter_predictors_mby(&x->e_mbd);
+ VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum);
+
+ if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+ {
+ x->skip = 1;
+ }
+ else if (sse < x->encode_breakout)
+ {
+ // Check u and v to make sure skip is ok
+ int sse2 = 0;
+
+ sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));
+
+ if (sse2 * 2 < x->encode_breakout)
+ {
+ x->skip = 1;
+ distortion2 = sse;
+ rate2 = 500;
+
+ disable_skip = 1; // We have no real rate data so trying to adjust for rate_y and rate_uv below will cause problems.
+ this_rd = RDFUNC(x->rdmult, x->rddiv, rate2, distortion2, cpi->target_bits_per_mb);
+
+ break; // (PGW) Move break here from below - for now at least
+ }
+ else
+ x->skip = 0;
+ }
+
+ //intermodecost[mode_index] = vp8_cost_mv_ref(this_mode, mdcounts); // Experimental debug code
+
+ // Add in the Mv/mode cost
+ rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+
+ // Y cost and distortion
+ macro_block_yrd(x, &rate, &distortion, IF_RTCD(&cpi->rtcd.encodemb));
+ rate2 += rate;
+ rate_y = rate;
+ distortion2 += distortion;
+
+ // UV cost and distortion
+ vp8_rd_inter_uv(cpi, x, &rate, &distortion, cpi->common.full_pixel);
+ rate2 += rate;
+ rate_uv = rate;
+ distortion2 += distortion;
+ break;
+
+ default:
+ break;
+ }
+
+ if (!disable_skip)
+ {
+ // Test for the condition where the skip block will be activated because there are no non-zero coefficients, and make any necessary adjustment for rate
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ tteob = 0;
+
+ for (i = 0; i <= 24; i++)
+ {
+ tteob += x->e_mbd.block[i].eob;
+ }
+
+ if (tteob == 0)
+ {
+#if 1
+ rate2 -= (rate_y + rate_uv);
+
+ // Back out no skip flag costing and add in skip flag costing
+ if (cpi->prob_skip_false)
+ {
+ rate2 += vp8_cost_bit(cpi->prob_skip_false, 1);
+ rate2 -= vp8_cost_bit(cpi->prob_skip_false, 0);
+ }
+
+#else
+ int rateuseskip;
+ int ratenotuseskip;
+
+
+
+ ratenotuseskip = rate_y + rate_uv + vp8_cost_bit(cpi->prob_skip_false, 0);
+ rateuseskip = vp8_cost_bit(cpi->prob_skip_false, 1);
+
+ if (1) // rateuseskip<ratenotuseskip)
+ {
+ rate2 -= ratenotuseskip;
+ rate2 += rateuseskip;
+ force_no_skip = 0;
+ }
+ else
+ {
+ force_no_skip = 1;
+ }
+
+#endif
+ }
+
+#if 0
+ else
+ {
+ int rateuseskip;
+ int ratenotuseskip;
+ int maxdistortion;
+ int minrate;
+ int skip_rd;
+
+ // distortion when no coeff is encoded
+ maxdistortion = macro_block_max_error(x);
+
+ ratenotuseskip = rate_y + rate_uv + vp8_cost_bit(cpi->prob_skip_false, 0);
+ rateuseskip = vp8_cost_bit(cpi->prob_skip_false, 1);
+
+ minrate = rateuseskip - ratenotuseskip;
+
+ skip_rd = RDFUNC(x->rdmult, x->rddiv, minrate, maxdistortion - distortion2, cpi->target_bits_per_mb);
+
+ if (skip_rd + 50 < 0 && x->e_mbd.mbmi.ref_frame != INTRA_FRAME && rate_y + rate_uv < 4000)
+ {
+ force_no_skip = 1;
+ rate2 = rate2 + rateuseskip - ratenotuseskip;
+ distortion2 = maxdistortion;
+ }
+ else
+ {
+ force_no_skip = 0;
+ }
+
+ }
+
+#endif
+
+ }
+
+ // Calculate the final RD estimate for this mode
+ this_rd = RDFUNC(x->rdmult, x->rddiv, rate2, distortion2, cpi->target_bits_per_mb);
+ }
+
+ // Experimental debug code.
+ //all_rds[mode_index] = this_rd;
+ //all_rates[mode_index] = rate2;
+ //all_dist[mode_index] = distortion2;
+
+ if ((x->e_mbd.mbmi.ref_frame == INTRA_FRAME) && (this_rd < *returnintra))
+ {
+ *returnintra = this_rd ;
+ }
+
+ // Did this mode help, i.e. is it the new best mode?
+ if (this_rd < best_rd || x->skip)
+ {
+ // Note index of best mode so far
+ best_mode_index = mode_index;
+ x->e_mbd.mbmi.force_no_skip = force_no_skip;
+
+ if (this_mode <= B_PRED)
+ {
+ x->e_mbd.mbmi.uv_mode = uv_intra_mode;
+ }
+
+ *returnrate = rate2;
+ *returndistortion = distortion2;
+ best_rd = this_rd;
+ vpx_memcpy(&best_mbmode, &x->e_mbd.mbmi, sizeof(MB_MODE_INFO));
+
+ for (i = 0; i < 16; i++)
+ {
+ vpx_memcpy(&best_bmodes[i], &x->e_mbd.block[i].bmi, sizeof(B_MODE_INFO));
+ }
+
+ // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
+ cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+ }
+
+ // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
+ else
+ {
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+ }
+
+ if (x->skip)
+ break;
+ }
+
+ // Reduce the activation RD thresholds for the best choice mode
+ if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
+ {
+ int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+ cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+ cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+
+ // If we chose a split mode then reset the new MV thresholds as well
+ /*if ( vp8_mode_order[best_mode_index] == SPLITMV )
+ {
+ best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWMV] >> 4);
+ cpi->rd_thresh_mult[THR_NEWMV] = (cpi->rd_thresh_mult[THR_NEWMV] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWMV]-best_adjustment: MIN_THRESHMULT;
+ cpi->rd_threshes[THR_NEWMV] = (cpi->rd_baseline_thresh[THR_NEWMV] >> 7) * cpi->rd_thresh_mult[THR_NEWMV];
+
+ best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWG] >> 4);
+ cpi->rd_thresh_mult[THR_NEWG] = (cpi->rd_thresh_mult[THR_NEWG] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWG]-best_adjustment: MIN_THRESHMULT;
+ cpi->rd_threshes[THR_NEWG] = (cpi->rd_baseline_thresh[THR_NEWG] >> 7) * cpi->rd_thresh_mult[THR_NEWG];
+
+ best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWA] >> 4);
+ cpi->rd_thresh_mult[THR_NEWA] = (cpi->rd_thresh_mult[THR_NEWA] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWA]-best_adjustment: MIN_THRESHMULT;
+ cpi->rd_threshes[THR_NEWA] = (cpi->rd_baseline_thresh[THR_NEWA] >> 7) * cpi->rd_thresh_mult[THR_NEWA];
+ }*/
+
+ }
+
+ // If we have chosen new mv or split then decay the full search check count more quickly.
+ if ((vp8_mode_order[best_mode_index] == NEWMV) || (vp8_mode_order[best_mode_index] == SPLITMV))
+ {
+ int lf_or_gf = (vp8_ref_frame_order[best_mode_index] == LAST_FRAME) ? 0 : 1;
+
+ if (cpi->check_freq[lf_or_gf] && !cpi->do_full[lf_or_gf])
+ {
+ cpi->check_freq[lf_or_gf] --;
+ }
+ }
+
+ // Keep a record of best mode index that we chose
+ cpi->last_best_mode_index = best_mode_index;
+
+ // Note how often each mode is chosen as best
+ cpi->mode_chosen_counts[best_mode_index] ++;
+
+
+ if (cpi->is_src_frame_alt_ref && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
+ {
+ best_mbmode.mode = ZEROMV;
+ best_mbmode.ref_frame = ALTREF_FRAME;
+ best_mbmode.mv.as_int = 0;
+ best_mbmode.uv_mode = 0;
+ best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+ best_mbmode.partitioning = 0;
+ best_mbmode.dc_diff = 0;
+
+ vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+
+ for (i = 0; i < 16; i++)
+ {
+ vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO));
+ }
+
+ x->e_mbd.mbmi.mv.as_int = 0;
+
+ return best_rd;
+ }
+
+
+ // macroblock modes
+ vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+
+ for (i = 0; i < 16; i++)
+ {
+ vpx_memcpy(&x->e_mbd.block[i].bmi, &best_bmodes[i], sizeof(B_MODE_INFO));
+ }
+
+ x->e_mbd.mbmi.mv.as_mv = x->e_mbd.block[15].bmi.mv.as_mv;
+
+ return best_rd;
+}
+#endif
+
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
new file mode 100644
index 000000000..c6eae4b92
--- /dev/null
+++ b/vp8/encoder/rdopt.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RDOPT_H
+#define __INC_RDOPT_H
+void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion);
+int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion);
+int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_to, int *distortion);
+extern int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+
+
+#endif
diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c
new file mode 100644
index 000000000..74c6bd76a
--- /dev/null
+++ b/vp8/encoder/sad_c.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+
+unsigned int vp8_sad16x16_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad)
+{
+
+ int r, c;
+ unsigned int sad = 0;
+
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ sad += abs(src_ptr[c] - ref_ptr[c]);
+ }
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+
+ return sad;
+}
+
+
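+ // Generic m x n SAD used by the fixed-size wrappers below. Note that the 'max_sad'
+ // argument of the public functions is accepted for interface compatibility only;
+ // these C versions do not early-out.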
+static __inline
+unsigned int sad_mx_n_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ int m,
+ int n)
+{
+
+ int r, c;
+ unsigned int sad = 0;
+
+ for (r = 0; r < n; r++)
+ {
+ for (c = 0; c < m; c++)
+ {
+ sad += abs(src_ptr[c] - ref_ptr[c]);
+ }
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+
+ return sad;
+}
+
+
+unsigned int vp8_sad8x8_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad)
+{
+
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
+}
+
+
+unsigned int vp8_sad16x8_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad)
+{
+
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
+
+}
+
+
+unsigned int vp8_sad8x16_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad)
+{
+
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
+}
+
+
+unsigned int vp8_sad4x4_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad)
+{
+
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
+}
+
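+ // The x3 variants return SADs for three horizontally adjacent reference positions
+ // (ref_ptr, ref_ptr + 1, ref_ptr + 2); the x4d variants return SADs against four
+ // independent reference pointers.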
+void vp8_sad16x16x3_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array
+)
+{
+ sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad16x8x3_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array
+)
+{
+ sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad8x8x3_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array
+)
+{
+ sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad8x16x3_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array
+)
+{
+ sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad4x4x3_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array
+)
+{
+ sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad16x16x4d_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array
+)
+{
+ sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp8_sad16x8x4d_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array
+)
+{
+ sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp8_sad8x8x4d_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array
+)
+{
+ sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp8_sad8x16x4d_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array
+)
+{
+ sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp8_sad4x4x4d_c(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array
+)
+{
+ sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c
new file mode 100644
index 000000000..df214a89f
--- /dev/null
+++ b/vp8/encoder/ssim.c
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "math.h"
+
+#define C1 (float)(64 * 64 * 0.01*255*0.01*255)
+#define C2 (float)(64 * 64 * 0.03*255*0.03*255)
+
+static int width_y;
+static int height_y;
+static int height_uv;
+static int width_uv;
+static int stride_uv;
+static int stride;
+static int lumimask;
+static int luminance;
+static double plane_summed_weights = 0;
+
+static short img12_sum_block[8*4096*4096*2] ;
+
+static short img1_sum[8*4096*2];
+static short img2_sum[8*4096*2];
+static int img1_sq_sum[8*4096*2];
+static int img2_sq_sum[8*4096*2];
+static int img12_mul_sum[8*4096*2];
+
+
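+ // Per-window SSIM term evaluated on 8x8 window sums: with mu_* the window sums and
+ // pre_mu_* the sums of squares / cross products, this is the standard
+ // (2*mu_x*mu_y + C1)(2*cov + C2) / ((mu_x^2 + mu_y^2 + C1)(var_x + var_y + C2))
+ // expression applied to consistently scaled quantities.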
+double vp8_similarity
+(
+ int mu_x,
+ int mu_y,
+ int pre_mu_x2,
+ int pre_mu_y2,
+ int pre_mu_xy2
+)
+{
+ int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy;
+
+ mu_x2 = mu_x * mu_x;
+ mu_y2 = mu_y * mu_y;
+ mu_xy = mu_x * mu_y;
+
+ theta_x2 = 64 * pre_mu_x2 - mu_x2;
+ theta_y2 = 64 * pre_mu_y2 - mu_y2;
+ theta_xy = 64 * pre_mu_xy2 - mu_xy;
+
+ return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2));
+}
+
+double vp8_ssim
+(
+ const unsigned char *img1,
+ const unsigned char *img2,
+ int stride_img1,
+ int stride_img2,
+ int width,
+ int height
+)
+{
+ int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp;
+
+ double plane_quality, weight, mean;
+
+ short *img1_sum_ptr1, *img1_sum_ptr2;
+ short *img2_sum_ptr1, *img2_sum_ptr2;
+ int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2;
+ int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2;
+ int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2;
+
+ plane_quality = 0;
+
+ if (lumimask)
+ plane_summed_weights = 0.0f;
+ else
+ plane_summed_weights = (height - 7) * (width - 7);
+
+ //some prologue for the main loop
+ temp = 8 * width;
+
+ img1_sum_ptr1 = img1_sum + temp;
+ img2_sum_ptr1 = img2_sum + temp;
+ img1_sq_sum_ptr1 = img1_sq_sum + temp;
+ img2_sq_sum_ptr1 = img2_sq_sum + temp;
+ img12_mul_sum_ptr1 = img12_mul_sum + temp;
+
+ for (x = 0; x < width; x++)
+ {
+ img1_sum[x] = img1[x];
+ img2_sum[x] = img2[x];
+ img1_sq_sum[x] = img1[x] * img1[x];
+ img2_sq_sum[x] = img2[x] * img2[x];
+ img12_mul_sum[x] = img1[x] * img2[x];
+
+ img1_sum_ptr1[x] = 0;
+ img2_sum_ptr1[x] = 0;
+ img1_sq_sum_ptr1[x] = 0;
+ img2_sq_sum_ptr1[x] = 0;
+ img12_mul_sum_ptr1[x] = 0;
+ }
+
+ //the main loop
+ for (y = 1; y < height; y++)
+ {
+ img1 += stride_img1;
+ img2 += stride_img2;
+
+ temp = (y - 1) % 9 * width;
+
+ img1_sum_ptr1 = img1_sum + temp;
+ img2_sum_ptr1 = img2_sum + temp;
+ img1_sq_sum_ptr1 = img1_sq_sum + temp;
+ img2_sq_sum_ptr1 = img2_sq_sum + temp;
+ img12_mul_sum_ptr1 = img12_mul_sum + temp;
+
+ temp = y % 9 * width;
+
+ img1_sum_ptr2 = img1_sum + temp;
+ img2_sum_ptr2 = img2_sum + temp;
+ img1_sq_sum_ptr2 = img1_sq_sum + temp;
+ img2_sq_sum_ptr2 = img2_sq_sum + temp;
+ img12_mul_sum_ptr2 = img12_mul_sum + temp;
+
+ for (x = 0; x < width; x++)
+ {
+ img1_sum_ptr2[x] = img1_sum_ptr1[x] + img1[x];
+ img2_sum_ptr2[x] = img2_sum_ptr1[x] + img2[x];
+ img1_sq_sum_ptr2[x] = img1_sq_sum_ptr1[x] + img1[x] * img1[x];
+ img2_sq_sum_ptr2[x] = img2_sq_sum_ptr1[x] + img2[x] * img2[x];
+ img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x];
+ }
+
+ if (y > 6)
+ {
+            //calculate the per-column sums of the last 8 lines by subtracting the running sums from 8 lines back from the present running sums
+ temp = (y + 1) % 9 * width;
+
+ img1_sum_ptr1 = img1_sum + temp;
+ img2_sum_ptr1 = img2_sum + temp;
+ img1_sq_sum_ptr1 = img1_sq_sum + temp;
+ img2_sq_sum_ptr1 = img2_sq_sum + temp;
+ img12_mul_sum_ptr1 = img12_mul_sum + temp;
+
+ for (x = 0; x < width; x++)
+ {
+ img1_sum_ptr1[x] = img1_sum_ptr2[x] - img1_sum_ptr1[x];
+ img2_sum_ptr1[x] = img2_sum_ptr2[x] - img2_sum_ptr1[x];
+ img1_sq_sum_ptr1[x] = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x];
+ img2_sq_sum_ptr1[x] = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x];
+ img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x];
+ }
+
+ //here we calculate the sum over the 8x8 block of pixels
+ //this is done by sliding a window across the column sums for the last 8 lines
+ //each time adding the new column sum, and subtracting the one which fell out of the window
+ img1_block = 0;
+ img2_block = 0;
+ img1_sq_block = 0;
+ img2_sq_block = 0;
+ img12_mul_block = 0;
+
+            //prologue, and calculation of the similarity measure from the first 8 column sums
+ for (x = 0; x < 8; x++)
+ {
+ img1_block += img1_sum_ptr1[x];
+ img2_block += img2_sum_ptr1[x];
+ img1_sq_block += img1_sq_sum_ptr1[x];
+ img2_sq_block += img2_sq_sum_ptr1[x];
+ img12_mul_block += img12_mul_sum_ptr1[x];
+ }
+
+ if (lumimask)
+ {
+ y2 = y - 7;
+ x2 = 0;
+
+ if (luminance)
+ {
+ mean = (img2_block + img1_block) / 128.0f;
+
+ if (!(y2 % 2 || x2 % 2))
+ *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
+ }
+ else
+ {
+ mean = *(img12_sum_block + y2 * width_uv + x2);
+ mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
+ mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
+ mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
+
+ mean /= 512.0f;
+ }
+
+ weight = mean < 40 ? 0.0f :
+ (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
+ plane_summed_weights += weight;
+
+ plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
+ }
+ else
+ plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
+
+ //and for the rest
+ for (x = 8; x < width; x++)
+ {
+ img1_block = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8];
+ img2_block = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8];
+ img1_sq_block = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8];
+ img2_sq_block = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8];
+ img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8];
+
+ if (lumimask)
+ {
+ y2 = y - 7;
+ x2 = x - 7;
+
+ if (luminance)
+ {
+ mean = (img2_block + img1_block) / 128.0f;
+
+ if (!(y2 % 2 || x2 % 2))
+ *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
+ }
+ else
+ {
+ mean = *(img12_sum_block + y2 * width_uv + x2);
+ mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
+ mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
+ mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
+
+ mean /= 512.0f;
+ }
+
+ weight = mean < 40 ? 0.0f :
+ (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
+ plane_summed_weights += weight;
+
+ plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
+ }
+ else
+ plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
+ }
+ }
+ }
+
+ if (plane_summed_weights == 0)
+ return 1.0f;
+ else
+ return plane_quality / plane_summed_weights;
+}
+
+double vp8_calc_ssim
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ int lumamask,
+ double *weight
+)
+{
+ double a, b, c;
+ double frame_weight;
+ double ssimv;
+
+ width_y = source->y_width;
+ height_y = source->y_height;
+ height_uv = source->uv_height;
+ width_uv = source->uv_width;
+ stride_uv = dest->uv_stride;
+ stride = dest->y_stride;
+
+ lumimask = lumamask;
+
+ luminance = 1;
+ a = vp8_ssim(source->y_buffer, dest->y_buffer,
+ source->y_stride, dest->y_stride, source->y_width, source->y_height);
+ luminance = 0;
+
+ frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7));
+
+ if (frame_weight == 0)
+ a = b = c = 1.0f;
+ else
+ {
+ b = vp8_ssim(source->u_buffer, dest->u_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
+
+ c = vp8_ssim(source->v_buffer, dest->v_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
+ }
+
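+    // Combine the plane scores: luma weighted 0.8, each chroma plane 0.1.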
+ ssimv = a * .8 + .1 * (b + c);
+
+ *weight = frame_weight;
+
+ return ssimv;
+}
+
+// Google version of SSIM
+// SSIM
+#define KERNEL 3
+#define KERNEL_SIZE (2 * KERNEL + 1)
+
+typedef unsigned char uint8;
+typedef unsigned int uint32;
+
+static const int K[KERNEL_SIZE] =
+{
+ 1, 4, 11, 16, 11, 4, 1 // 16 * exp(-0.3 * i * i)
+};
+static const double ki_w = 1. / 2304.;  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
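+// The taps of K sum to 48 (16 + 2 * (11 + 4 + 1)), so the total 2-D weight of
+// the unclipped 7x7 kernel is 48 * 48 = 2304, which is what ki_w divides by.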
+double get_ssimg(const uint8 *org, const uint8 *rec,
+ int xo, int yo, int W, int H,
+ const int stride1, const int stride2
+ )
+{
+ // TODO(skal): use summed tables
+ int y, x;
+
+ const int ymin = (yo - KERNEL < 0) ? 0 : yo - KERNEL;
+ const int ymax = (yo + KERNEL > H - 1) ? H - 1 : yo + KERNEL;
+ const int xmin = (xo - KERNEL < 0) ? 0 : xo - KERNEL;
+ const int xmax = (xo + KERNEL > W - 1) ? W - 1 : xo + KERNEL;
+ // worst case of accumulation is a weight of 48 = 16 + 2 * (11 + 4 + 1)
+    // with a diff of 255, squared. That gives a max accumulated value of
+    // 0x8ee0900, which fits in a 32-bit integer.
+ uint32 w = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+ org += ymin * stride1;
+ rec += ymin * stride2;
+
+ for (y = ymin; y <= ymax; ++y, org += stride1, rec += stride2)
+ {
+ const int Wy = K[KERNEL + y - yo];
+
+ for (x = xmin; x <= xmax; ++x)
+ {
+ const int Wxy = Wy * K[KERNEL + x - xo];
+ // TODO(skal): inlined assembly
+ w += Wxy;
+ xm += Wxy * org[x];
+ ym += Wxy * rec[x];
+ xxm += Wxy * org[x] * org[x];
+ xym += Wxy * org[x] * rec[x];
+ yym += Wxy * rec[x] * rec[x];
+ }
+ }
+
+ {
+ const double iw = 1. / w;
+ const double iwx = xm * iw;
+ const double iwy = ym * iw;
+ double sxx = xxm * iw - iwx * iwx;
+ double syy = yym * iw - iwy * iwy;
+
+ // small errors are possible, due to rounding. Clamp to zero.
+ if (sxx < 0.) sxx = 0.;
+
+ if (syy < 0.) syy = 0.;
+
+ {
+ const double sxsy = sqrt(sxx * syy);
+ const double sxy = xym * iw - iwx * iwy;
+ static const double C11 = (0.01 * 0.01) * (255 * 255);
+ static const double C22 = (0.03 * 0.03) * (255 * 255);
+ static const double C33 = (0.015 * 0.015) * (255 * 255);
+ const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
+ const double c = (2. * sxsy + C22) / (sxx + syy + C22);
+
+ const double s = (sxy + C33) / (sxsy + C33);
+ return l * c * s;
+
+ }
+ }
+
+}
+
+double get_ssimfull_kernelg(const uint8 *org, const uint8 *rec,
+ int xo, int yo, int W, int H,
+ const int stride1, const int stride2)
+{
+ // TODO(skal): use summed tables
+ // worst case of accumulation is a weight of 48 = 16 + 2 * (11 + 4 + 1)
+    // with a diff of 255, squared. That gives a max accumulated value of
+    // 0x8ee0900, which fits in a 32-bit integer.
+ int y_, x_;
+ uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+ org += (yo - KERNEL) * stride1;
+ org += (xo - KERNEL);
+ rec += (yo - KERNEL) * stride2;
+ rec += (xo - KERNEL);
+
+ for (y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride1, rec += stride2)
+ {
+ const int Wy = K[y_];
+
+ for (x_ = 0; x_ < KERNEL_SIZE; ++x_)
+ {
+ const int Wxy = Wy * K[x_];
+ // TODO(skal): inlined assembly
+ const int org_x = org[x_];
+ const int rec_x = rec[x_];
+ xm += Wxy * org_x;
+ ym += Wxy * rec_x;
+ xxm += Wxy * org_x * org_x;
+ xym += Wxy * org_x * rec_x;
+ yym += Wxy * rec_x * rec_x;
+ }
+ }
+
+ {
+ const double iw = ki_w;
+ const double iwx = xm * iw;
+ const double iwy = ym * iw;
+ double sxx = xxm * iw - iwx * iwx;
+ double syy = yym * iw - iwy * iwy;
+
+ // small errors are possible, due to rounding. Clamp to zero.
+ if (sxx < 0.) sxx = 0.;
+
+ if (syy < 0.) syy = 0.;
+
+ {
+ const double sxsy = sqrt(sxx * syy);
+ const double sxy = xym * iw - iwx * iwy;
+ static const double C11 = (0.01 * 0.01) * (255 * 255);
+ static const double C22 = (0.03 * 0.03) * (255 * 255);
+ static const double C33 = (0.015 * 0.015) * (255 * 255);
+ const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
+ const double c = (2. * sxsy + C22) / (sxx + syy + C22);
+ const double s = (sxy + C33) / (sxsy + C33);
+ return l * c * s;
+ }
+ }
+}
+
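+// Accumulate the per-pixel SSIM score over a whole plane: rows and columns
+// within KERNEL pixels of an edge use get_ssimg, which clips the kernel to the
+// image, while the interior uses get_ssimfull_kernelg with the full 7x7 kernel.
+// The returned value is an unnormalized sum; callers divide by the pixel count.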
+double calc_ssimg(const uint8 *org, const uint8 *rec,
+ const int image_width, const int image_height,
+ const int stride1, const int stride2
+ )
+{
+ int j, i;
+ double SSIM = 0.;
+
+ for (j = 0; j < KERNEL; ++j)
+ {
+ for (i = 0; i < image_width; ++i)
+ {
+ SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
+ }
+ }
+
+ for (j = KERNEL; j < image_height - KERNEL; ++j)
+ {
+ for (i = 0; i < KERNEL; ++i)
+ {
+ SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
+ }
+
+ for (i = KERNEL; i < image_width - KERNEL; ++i)
+ {
+ SSIM += get_ssimfull_kernelg(org, rec, i, j,
+ image_width, image_height, stride1, stride2);
+ }
+
+ for (i = image_width - KERNEL; i < image_width; ++i)
+ {
+ SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
+ }
+ }
+
+ for (j = image_height - KERNEL; j < image_height; ++j)
+ {
+ for (i = 0; i < image_width; ++i)
+ {
+ SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
+ }
+ }
+
+ return SSIM;
+}
+
+
+double vp8_calc_ssimg
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ double *ssim_y,
+ double *ssim_u,
+ double *ssim_v
+)
+{
+ double ssim_all = 0;
+ int ysize = source->y_width * source->y_height;
+ int uvsize = ysize / 4;
+
+ *ssim_y = calc_ssimg(source->y_buffer, dest->y_buffer,
+ source->y_width, source->y_height,
+ source->y_stride, dest->y_stride);
+
+
+ *ssim_u = calc_ssimg(source->u_buffer, dest->u_buffer,
+ source->uv_width, source->uv_height,
+ source->uv_stride, dest->uv_stride);
+
+
+ *ssim_v = calc_ssimg(source->v_buffer, dest->v_buffer,
+ source->uv_width, source->uv_height,
+ source->uv_stride, dest->uv_stride);
+
+ ssim_all = (*ssim_y + *ssim_u + *ssim_v) / (ysize + uvsize + uvsize);
+ *ssim_y /= ysize;
+ *ssim_u /= uvsize;
+ *ssim_v /= uvsize;
+ return ssim_all;
+}
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
new file mode 100644
index 000000000..33ddd64e7
--- /dev/null
+++ b/vp8/encoder/tokenize.c
@@ -0,0 +1,636 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "onyx_int.h"
+#include "tokenize.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Global event counters used for accumulating statistics across several
+   compressions, which are then used to generate context.c (the initial stats). */
+
+#ifdef ENTROPY_STATS
+_int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];
+#endif
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
+void vp8_fix_contexts(VP8_COMP *cpi, MACROBLOCKD *x);
+
+TOKENEXTRA vp8_dct_value_tokens[DCT_MAX_VALUE*2];
+TOKENEXTRA *vp8_dct_value_tokens_ptr;
+int vp8_dct_value_cost[DCT_MAX_VALUE*2];
+int *vp8_dct_value_cost_ptr;
+#if 0
+int skip_true_count = 0;
+int skip_false_count = 0;
+#endif
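+/* Build the coefficient-value lookup tables used by the tokenizer: for every
+   value v in [-DCT_MAX_VALUE, DCT_MAX_VALUE) store its token (magnitudes up to
+   4 map directly; larger ones map to the extra-bit category whose base_val
+   they fall under), its Extra field (sign in bit 0, offset from base_val in
+   the upper bits), and, for the category tokens, the cost of those extra bits. */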
+static void fill_value_tokens()
+{
+
+ TOKENEXTRA *const t = vp8_dct_value_tokens + DCT_MAX_VALUE;
+ vp8_extra_bit_struct *const e = vp8_extra_bits;
+
+ int i = -DCT_MAX_VALUE;
+ int sign = 1;
+
+ do
+ {
+ if (!i)
+ sign = 0;
+
+ {
+ const int a = sign ? -i : i;
+ int eb = sign;
+
+ if (a > 4)
+ {
+ int j = 4;
+
+ while (++j < 11 && e[j].base_val <= a) {}
+
+ t[i].Token = --j;
+ eb |= (a - e[j].base_val) << 1;
+ }
+ else
+ t[i].Token = a;
+
+ t[i].Extra = eb;
+ }
+
+        // initialize the cost of the extra bits for every possible coefficient value.
+ {
+ int cost = 0;
+ vp8_extra_bit_struct *p = vp8_extra_bits + t[i].Token;
+
+ if (p->base_val)
+ {
+ const int extra = t[i].Extra;
+ const int Length = p->Len;
+
+ if (Length)
+ cost += vp8_treed_cost(p->tree, p->prob, extra >> 1, Length);
+
+ cost += vp8_cost_bit(vp8_prob_half, extra & 1); /* sign */
+ vp8_dct_value_cost[i + DCT_MAX_VALUE] = cost;
+ }
+
+ }
+
+ }
+ while (++i < DCT_MAX_VALUE);
+
+ vp8_dct_value_tokens_ptr = vp8_dct_value_tokens + DCT_MAX_VALUE;
+ vp8_dct_value_cost_ptr = vp8_dct_value_cost + DCT_MAX_VALUE;
+}
+
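+/* Note on skip_eob_node in the two tokenize functions below: when the previous
+   token in this block was a zero (pt == 0 at any position after the first one
+   coded), the EOB branch of the coefficient tree is skipped, because a zero
+   token is never immediately followed by EOB. */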
+static void tokenize2nd_order_b
+(
+ const BLOCKD *const b,
+ TOKENEXTRA **tp,
+ const int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+ const FRAME_TYPE frametype,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ VP8_COMP *cpi
+)
+{
+ int pt; /* near block/prev token context index */
+ int c = 0; /* start at DC */
+ const int eob = b->eob; /* one beyond last nonzero coeff */
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ int x;
+ const short *qcoeff_ptr = b->qcoeff;
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ do
+ {
+ const int band = vp8_coef_bands[c];
+
+ if (c < eob)
+ {
+ int rc = vp8_default_zig_zag1d[c];
+ const int v = qcoeff_ptr[rc];
+
+ assert(-DCT_MAX_VALUE <= v && v < (DCT_MAX_VALUE));
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ x = vp8_dct_value_tokens_ptr[v].Token;
+ }
+ else
+ x = DCT_EOB_TOKEN;
+
+ t->Token = x;
+ t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+ t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0);
+
+ t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
+
+ ++cpi->coef_counts [type] [band] [pt] [x];
+ }
+ while (pt = vp8_prev_token_class[x], ++t, c < eob && ++c < 16);
+
+ *tp = t;
+ pt = (c != !type); /* 0 <-> all coeff data is zero */
+ *a = *l = pt;
+
+}
+
+static void tokenize1st_order_b
+(
+ const BLOCKD *const b,
+ TOKENEXTRA **tp,
+ const int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+ const FRAME_TYPE frametype,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ VP8_COMP *cpi
+)
+{
+ int pt; /* near block/prev token context index */
+ int c = type ? 0 : 1; /* start at DC unless type 0 */
+ const int eob = b->eob; /* one beyond last nonzero coeff */
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ int x;
+ const short *qcoeff_ptr = b->qcoeff;
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ do
+ {
+ const int band = vp8_coef_bands[c];
+
+ x = DCT_EOB_TOKEN;
+
+ if (c < eob)
+ {
+ int rc = vp8_default_zig_zag1d[c];
+ const int v = qcoeff_ptr[rc];
+
+ assert(-DCT_MAX_VALUE <= v && v < (DCT_MAX_VALUE));
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ x = vp8_dct_value_tokens_ptr[v].Token;
+ }
+
+ t->Token = x;
+ t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+ t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0);
+ t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
+
+ ++cpi->coef_counts [type] [band] [pt] [x];
+ }
+ while (pt = vp8_prev_token_class[x], ++t, c < eob && ++c < 16);
+
+ *tp = t;
+ pt = (c != !type); /* 0 <-> all coeff data is zero */
+ *a = *l = pt;
+
+}
+#if 0
+void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
+{
+ //int i;
+ ENTROPY_CONTEXT **const A = x->above_context;
+ ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+ int plane_type;
+ int b;
+
+ TOKENEXTRA *start = *t;
+ TOKENEXTRA *tp = *t;
+
+ x->mbmi.dc_diff = 1;
+
+ vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts));
+
+ if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
+ {
+ plane_type = 3;
+ }
+ else
+ {
+ tokenize2nd_order_b(x->block + 24, t, 1, x->frame_type,
+ A[Y2CONTEXT] + vp8_block2above[24], L[Y2CONTEXT] + vp8_block2left[24], cpi);
+ plane_type = 0;
+
+ }
+
+ for (b = 0; b < 16; b++)
+ tokenize1st_order_b(x->block + b, t, plane_type, x->frame_type,
+ A[vp8_block2context[b]] + vp8_block2above[b],
+ L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+ for (b = 16; b < 24; b++)
+ tokenize1st_order_b(x->block + b, t, 2, x->frame_type,
+ A[vp8_block2context[b]] + vp8_block2above[b],
+ L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ x->mbmi.mb_skip_coeff = 1;
+
+ while ((tp != *t) && x->mbmi.mb_skip_coeff)
+ {
+ x->mbmi.mb_skip_coeff = (x->mbmi.mb_skip_coeff && (tp->Token == DCT_EOB_TOKEN));
+ tp ++;
+ }
+
+ if (x->mbmi.mb_skip_coeff == 1)
+ {
+ x->mbmi.dc_diff = 0;
+            //redo the counts
+ vpx_memcpy(cpi->coef_counts, cpi->coef_counts_backup, sizeof(cpi->coef_counts));
+
+ *t = start;
+ cpi->skip_true_count++;
+
+ //skip_true_count++;
+ }
+ else
+ {
+
+ cpi->skip_false_count++;
+ //skip_false_count++;
+ }
+ }
+}
+#else
+void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
+{
+ //int i;
+ ENTROPY_CONTEXT **const A = x->above_context;
+ ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+ int plane_type;
+ int b;
+
+ TOKENEXTRA *start = *t;
+ TOKENEXTRA *tp = *t;
+
+ x->mbmi.dc_diff = 1;
+
+#if 0
+
+ if (x->mbmi.force_no_skip)
+ {
+ x->mbmi.mb_skip_coeff = 1;
+ //reset for next_mb.
+ x->mbmi.force_no_skip = 0;
+ }
+
+#endif
+
+#if 1
+
+ if (x->mbmi.mb_skip_coeff)
+ {
+
+ cpi->skip_true_count++;
+
+ if (!cpi->common.mb_no_coeff_skip)
+ vp8_stuff_mb(cpi, x, t) ;
+ else
+ {
+ vp8_fix_contexts(cpi, x);
+ }
+
+ if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+ x->mbmi.dc_diff = 0;
+ else
+ x->mbmi.dc_diff = 1;
+
+
+ return;
+ }
+
+ cpi->skip_false_count++;
+#endif
+#if 0
+
+ if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
+ {
+ int i, skip = 1;
+
+ for (i = 0; i < 24; i++)
+ skip &= (!x->block[i].eob);
+
+ if (skip != x->mbmi.mb_skip_coeff)
+ skip += 0;
+
+ x->mbmi.mb_skip_coeff = skip;
+ }
+ else
+ {
+ int i, skip = 1;
+
+ for (i = 0; i < 16; i++)
+ skip &= (x->block[i].eob < 2);
+
+ for (i = 16; i < 25; i++)
+ skip &= (!x->block[i].eob);
+
+ if (skip != x->mbmi.mb_skip_coeff)
+ skip += 0;
+
+ x->mbmi.mb_skip_coeff = skip;
+ }
+
+ vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts));
+#endif
+
+ if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
+ {
+ plane_type = 3;
+ }
+ else
+ {
+ tokenize2nd_order_b(x->block + 24, t, 1, x->frame_type,
+ A[Y2CONTEXT] + vp8_block2above[24], L[Y2CONTEXT] + vp8_block2left[24], cpi);
+ plane_type = 0;
+
+ }
+
+ for (b = 0; b < 16; b++)
+ tokenize1st_order_b(x->block + b, t, plane_type, x->frame_type,
+ A[vp8_block2context[b]] + vp8_block2above[b],
+ L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+ for (b = 16; b < 24; b++)
+ tokenize1st_order_b(x->block + b, t, 2, x->frame_type,
+ A[vp8_block2context[b]] + vp8_block2above[b],
+ L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+#if 0
+
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ int skip = 1;
+
+ while ((tp != *t) && skip)
+ {
+ skip = (skip && (tp->Token == DCT_EOB_TOKEN));
+ tp ++;
+ }
+
+ if (skip != x->mbmi.mb_skip_coeff)
+ skip += 0;
+
+ x->mbmi.mb_skip_coeff = skip;
+
+ if (x->mbmi.mb_skip_coeff == 1)
+ {
+ x->mbmi.dc_diff = 0;
+            //redo the counts
+ vpx_memcpy(cpi->coef_counts, cpi->coef_counts_backup, sizeof(cpi->coef_counts));
+
+ *t = start;
+ cpi->skip_true_count++;
+ //skip_true_count++;
+ }
+ else
+ {
+
+ cpi->skip_false_count++;
+ //skip_false_count++;
+ }
+ }
+
+#endif
+}
+#endif
+
+#ifdef ENTROPY_STATS
+
+void init_context_counters(void)
+{
+ vpx_memset(context_counters, 0, sizeof(context_counters));
+}
+
+void print_context_counters()
+{
+
+ int type, band, pt, t;
+
+ FILE *const f = fopen("context.c", "w");
+
+ fprintf(f, "#include \"entropy.h\"\n");
+
+ fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
+
+ fprintf(f, "int Contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];\n\n");
+
+ fprintf(f, "const int default_contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens] = {");
+
+# define Comma( X) (X? ",":"")
+
+ type = 0;
+
+ do
+ {
+ fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
+
+ band = 0;
+
+ do
+ {
+ fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
+
+ pt = 0;
+
+ do
+ {
+ fprintf(f, "%s\n {", Comma(pt));
+
+ t = 0;
+
+ do
+ {
+ const _int64 x = context_counters [type] [band] [pt] [t];
+ const int y = (int) x;
+
+ assert(x == (_int64) y); /* no overflow handling yet */
+ fprintf(f, "%s %d", Comma(t), y);
+
+ }
+ while (++t < vp8_coef_tokens);
+
+ fprintf(f, "}");
+ }
+ while (++pt < PREV_COEF_CONTEXTS);
+
+ fprintf(f, "\n }");
+
+ }
+ while (++band < COEF_BANDS);
+
+ fprintf(f, "\n }");
+ }
+ while (++type < BLOCK_TYPES);
+
+ fprintf(f, "\n};\n");
+ fclose(f);
+}
+#endif
+
+
+void vp8_tokenize_initialize()
+{
+ fill_value_tokens();
+}
+
+
+static __inline void stuff2nd_order_b
+(
+ const BLOCKD *const b,
+ TOKENEXTRA **tp,
+ const int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+ const FRAME_TYPE frametype,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ VP8_COMP *cpi
+)
+{
+ int pt; /* near block/prev token context index */
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ (void) frametype;
+ (void) type;
+ (void) b;
+
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+ t->section = 11;
+ t->skip_eob_node = 0;
+ ++cpi->coef_counts [1] [0] [pt] [DCT_EOB_TOKEN];
+ ++t;
+
+ *tp = t;
+ pt = 0;
+ *a = *l = pt;
+
+}
+
+static __inline void stuff1st_order_b
+(
+ const BLOCKD *const b,
+ TOKENEXTRA **tp,
+ const int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+ const FRAME_TYPE frametype,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ VP8_COMP *cpi
+)
+{
+ int pt; /* near block/prev token context index */
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ (void) frametype;
+ (void) type;
+ (void) b;
+
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt];
+ t->section = 8;
+ t->skip_eob_node = 0;
+ ++cpi->coef_counts [0] [1] [pt] [DCT_EOB_TOKEN];
+ ++t;
+ *tp = t;
+ pt = 0; /* 0 <-> all coeff data is zero */
+ *a = *l = pt;
+
+}
+static __inline
+void stuff1st_order_buv
+(
+ const BLOCKD *const b,
+ TOKENEXTRA **tp,
+ const int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+ const FRAME_TYPE frametype,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ VP8_COMP *cpi
+)
+{
+ int pt; /* near block/prev token context index */
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ (void) frametype;
+ (void) type;
+ (void) b;
+
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+ t->section = 13;
+ t->skip_eob_node = 0;
+ ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
+ ++t;
+ *tp = t;
+ pt = 0; /* 0 <-> all coeff data is zero */
+ *a = *l = pt;
+
+}
+
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
+{
+ //int i;
+ ENTROPY_CONTEXT **const A = x->above_context;
+ ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+ int plane_type;
+ int b;
+
+ stuff2nd_order_b(x->block + 24, t, 1, x->frame_type,
+ A[Y2CONTEXT] + vp8_block2above[24], L[Y2CONTEXT] + vp8_block2left[24], cpi);
+ plane_type = 0;
+
+
+ if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+ x->mbmi.dc_diff = 0;
+ else
+ x->mbmi.dc_diff = 1;
+
+
+ for (b = 0; b < 16; b++)
+ stuff1st_order_b(x->block + b, t, plane_type, x->frame_type,
+ A[vp8_block2context[b]] + vp8_block2above[b],
+ L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+ for (b = 16; b < 24; b++)
+ stuff1st_order_buv(x->block + b, t, 2, x->frame_type,
+ A[vp8_block2context[b]] + vp8_block2above[b],
+ L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+}
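+/* Reset the left/above entropy contexts for a macroblock whose coefficients
+   are coded as skipped (all zero). The Y2 context is cleared only for modes
+   that actually use a Y2 block, i.e. anything other than B_PRED and SPLITMV. */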
+void vp8_fix_contexts(VP8_COMP *cpi, MACROBLOCKD *x)
+{
+ x->left_context[Y1CONTEXT][0] = 0;
+ x->left_context[Y1CONTEXT][1] = 0;
+ x->left_context[Y1CONTEXT][2] = 0;
+ x->left_context[Y1CONTEXT][3] = 0;
+ x->left_context[UCONTEXT][0] = 0;
+ x->left_context[VCONTEXT][0] = 0;
+ x->left_context[UCONTEXT][1] = 0;
+ x->left_context[VCONTEXT][1] = 0;
+
+ x->above_context[Y1CONTEXT][0] = 0;
+ x->above_context[Y1CONTEXT][1] = 0;
+ x->above_context[Y1CONTEXT][2] = 0;
+ x->above_context[Y1CONTEXT][3] = 0;
+ x->above_context[UCONTEXT][0] = 0;
+ x->above_context[VCONTEXT][0] = 0;
+ x->above_context[UCONTEXT][1] = 0;
+ x->above_context[VCONTEXT][1] = 0;
+
+ if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+ {
+ x->left_context[Y2CONTEXT][0] = 0;
+ x->above_context[Y2CONTEXT][0] = 0;
+ }
+}
diff --git a/vp8/encoder/tokenize.h b/vp8/encoder/tokenize.h
new file mode 100644
index 000000000..02aacc222
--- /dev/null
+++ b/vp8/encoder/tokenize.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef tokenize_h
+#define tokenize_h
+
+#include "entropy.h"
+#include "block.h"
+
+void vp8_tokenize_initialize();
+
+typedef struct
+{
+ int Token;
+ int Extra;
+ const vp8_prob *context_tree;
+ int skip_eob_node;
+ int section;
+} TOKENEXTRA;
+
+int rd_cost_mby(MACROBLOCKD *);
+
+#ifdef ENTROPY_STATS
+void init_context_counters();
+void print_context_counters();
+
+extern _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];
+#endif
+
+
+#endif /* tokenize_h */
diff --git a/vp8/encoder/treewriter.c b/vp8/encoder/treewriter.c
new file mode 100644
index 000000000..e398044db
--- /dev/null
+++ b/vp8/encoder/treewriter.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "treewriter.h"
+
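+/* Recursively walk the token tree, adding the cost of each branch taken (in
+   1/256-bit units via vp8_cost_bit); when a leaf is reached (j <= 0), store
+   the accumulated cost in C[-j], i.e. at the token value the leaf represents. */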
+static void cost(
+ int *const C,
+ vp8_tree T,
+ const vp8_prob *const P,
+ int i,
+ int c
+)
+{
+ const vp8_prob p = P [i>>1];
+
+ do
+ {
+ const vp8_tree_index j = T[i];
+ const int d = c + vp8_cost_bit(p, i & 1);
+
+ if (j <= 0)
+ C[-j] = d;
+ else
+ cost(C, T, P, j, d);
+ }
+ while (++i & 1);
+}
+void vp8_cost_tokens(int *c, const vp8_prob *p, vp8_tree t)
+{
+ cost(c, t, p, 0, 0);
+}
diff --git a/vp8/encoder/treewriter.h b/vp8/encoder/treewriter.h
new file mode 100644
index 000000000..05ac74cb7
--- /dev/null
+++ b/vp8/encoder/treewriter.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREEWRITER_H
+#define __INC_TREEWRITER_H
+
+/* Trees map alphabets into huffman-like codes suitable for an arithmetic
+ bit coder. Timothy S Murphy 11 October 2004 */
+
+#include "treecoder.h"
+
+#include "boolhuff.h" /* for now */
+
+typedef BOOL_CODER vp8_writer;
+
+#define vp8_write vp8_encode_bool
+#define vp8_write_literal vp8_encode_value
+#define vp8_write_bit( W, V) vp8_write( W, V, vp8_prob_half)
+
+#define vp8bc_write vp8bc_write_bool
+#define vp8bc_write_literal vp8bc_write_bits
+#define vp8bc_write_bit( W, V) vp8bc_write_bits( W, V, 1)
+
+
+/* Approximate length of an encoded bool in 256ths of a bit at given prob */
+
+#define vp8_cost_zero( x) ( vp8_prob_cost[x])
+#define vp8_cost_one( x) vp8_cost_zero( vp8_complement(x))
+
+#define vp8_cost_bit( x, b) vp8_cost_zero( (b)? vp8_complement(x) : (x) )
+
+/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
+
+
+/* Both of these return bits, not scaled bits. */
+
+static __inline unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p)
+{
+ /* Imitate existing calculation */
+
+ return ((ct[0] * vp8_cost_zero(p))
+ + (ct[1] * vp8_cost_one(p))) >> 8;
+}
+
+/* Small functions to write explicit values and tokens, as well as
+ estimate their lengths. */
+
+static __inline void vp8_treed_write
+(
+ vp8_writer *const w,
+ vp8_tree t,
+ const vp8_prob *const p,
+ int v,
+ int n /* number of bits in v, assumed nonzero */
+)
+{
+ vp8_tree_index i = 0;
+
+ do
+ {
+ const int b = (v >> --n) & 1;
+ vp8_write(w, b, p[i>>1]);
+ i = t[i+b];
+ }
+ while (n);
+}
+static __inline void vp8_write_token
+(
+ vp8_writer *const w,
+ vp8_tree t,
+ const vp8_prob *const p,
+ vp8_token *const x
+)
+{
+ vp8_treed_write(w, t, p, x->value, x->Len);
+}
+
+static __inline int vp8_treed_cost(
+ vp8_tree t,
+ const vp8_prob *const p,
+ int v,
+ int n /* number of bits in v, assumed nonzero */
+)
+{
+ int c = 0;
+ vp8_tree_index i = 0;
+
+ do
+ {
+ const int b = (v >> --n) & 1;
+ c += vp8_cost_bit(p[i>>1], b);
+ i = t[i+b];
+ }
+ while (n);
+
+ return c;
+}
+static __inline int vp8_cost_token
+(
+ vp8_tree t,
+ const vp8_prob *const p,
+ vp8_token *const x
+)
+{
+ return vp8_treed_cost(t, p, x->value, x->Len);
+}
+
+/* Fill array of costs for all possible token values. */
+
+void vp8_cost_tokens(
+ int *Costs, const vp8_prob *, vp8_tree
+);
+
+#endif
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
new file mode 100644
index 000000000..b3b55c319
--- /dev/null
+++ b/vp8/encoder/variance.h
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_H
+#define VARIANCE_H
+
+#define prototype_sad(sym)\
+ unsigned int (sym)\
+ (\
+ unsigned char *src_ptr, \
+ int source_stride, \
+ unsigned char *ref_ptr, \
+ int ref_stride, \
+ int max_sad\
+ )
+
+#define prototype_sad_multi_same_address(sym)\
+ void (sym)\
+ (\
+ unsigned char *src_ptr, \
+ int source_stride, \
+ unsigned char *ref_ptr, \
+ int ref_stride, \
+ unsigned int *sad_array\
+ )
+
+#define prototype_sad_multi_dif_address(sym)\
+ void (sym)\
+ (\
+ unsigned char *src_ptr, \
+ int source_stride, \
+ unsigned char *ref_ptr[4], \
+ int ref_stride, \
+ unsigned int *sad_array\
+ )
+
+#define prototype_variance(sym) \
+ unsigned int (sym) \
+ (\
+ unsigned char *src_ptr, \
+ int source_stride, \
+ unsigned char *ref_ptr, \
+ int ref_stride, \
+ unsigned int *sse\
+ )
+
+#define prototype_variance2(sym) \
+ unsigned int (sym) \
+ (\
+ unsigned char *src_ptr, \
+ int source_stride, \
+ unsigned char *ref_ptr, \
+ int ref_stride, \
+ unsigned int *sse,\
+ int *sum\
+ )
+
+#define prototype_subpixvariance(sym) \
+ unsigned int (sym) \
+ ( \
+ unsigned char *src_ptr, \
+ int source_stride, \
+ int xoffset, \
+ int yoffset, \
+ unsigned char *ref_ptr, \
+ int Refstride, \
+ unsigned int *sse \
+ );
+
+
+#define prototype_getmbss(sym) unsigned int (sym)(short *)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/variance_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/variance_arm.h"
+#endif
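+/* The platform headers above may #define any of the vp8_variance_* names to
+   optimized implementations; whatever they leave undefined falls back to the
+   generic C versions through the #ifndef defaults below. */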
+
+#ifndef vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_c
+#endif
+extern prototype_sad(vp8_variance_sad4x4);
+
+#ifndef vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_c
+#endif
+extern prototype_sad(vp8_variance_sad8x8);
+
+#ifndef vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_c
+#endif
+extern prototype_sad(vp8_variance_sad8x16);
+
+#ifndef vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_c
+#endif
+extern prototype_sad(vp8_variance_sad16x8);
+
+#ifndef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_c
+#endif
+extern prototype_sad(vp8_variance_sad16x16);
+
+//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+#ifndef vp8_variance_sad16x16x3
+#define vp8_variance_sad16x16x3 vp8_sad16x16x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad16x16x3);
+
+#ifndef vp8_variance_sad16x8x3
+#define vp8_variance_sad16x8x3 vp8_sad16x8x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad16x8x3);
+
+#ifndef vp8_variance_sad8x8x3
+#define vp8_variance_sad8x8x3 vp8_sad8x8x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad8x8x3);
+
+#ifndef vp8_variance_sad8x16x3
+#define vp8_variance_sad8x16x3 vp8_sad8x16x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
+
+#ifndef vp8_variance_sad4x4x3
+#define vp8_variance_sad4x4x3 vp8_sad4x4x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
+
+//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+#ifndef vp8_variance_sad16x16x4d
+#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad16x16x4d);
+
+#ifndef vp8_variance_sad16x8x4d
+#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad16x8x4d);
+
+#ifndef vp8_variance_sad8x8x4d
+#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad8x8x4d);
+
+#ifndef vp8_variance_sad8x16x4d
+#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad8x16x4d);
+
+#ifndef vp8_variance_sad4x4x4d
+#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad4x4x4d);
+
+//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+#ifndef vp8_variance_var4x4
+#define vp8_variance_var4x4 vp8_variance4x4_c
+#endif
+extern prototype_variance(vp8_variance_var4x4);
+
+#ifndef vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_c
+#endif
+extern prototype_variance(vp8_variance_var8x8);
+
+#ifndef vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_c
+#endif
+extern prototype_variance(vp8_variance_var8x16);
+
+#ifndef vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_c
+#endif
+extern prototype_variance(vp8_variance_var16x8);
+
+#ifndef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_c
+#endif
+extern prototype_variance(vp8_variance_var16x16);
+
+//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+#ifndef vp8_variance_subpixvar4x4
+#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar4x4);
+
+#ifndef vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar8x8);
+
+#ifndef vp8_variance_subpixvar8x16
+#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar8x16);
+
+#ifndef vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar16x8);
+
+#ifndef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar16x16);
+
+#ifndef vp8_variance_subpixmse16x16
+#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixmse16x16);
+
+//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+#ifndef vp8_variance_getmbss
+#define vp8_variance_getmbss vp8_get_mb_ss_c
+#endif
+extern prototype_getmbss(vp8_variance_getmbss);
+
+#ifndef vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_c
+#endif
+extern prototype_variance(vp8_variance_mse16x16);
+
+#ifndef vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_c
+#endif
+extern prototype_sad(vp8_variance_get16x16prederror);
+
+#ifndef vp8_variance_get8x8var
+#define vp8_variance_get8x8var vp8_get8x8var_c
+#endif
+extern prototype_variance2(vp8_variance_get8x8var);
+
+#ifndef vp8_variance_get16x16var
+#define vp8_variance_get16x16var vp8_get16x16var_c
+#endif
+extern prototype_variance2(vp8_variance_get16x16var);
+
+#ifndef vp8_variance_get4x4sse_cs
+#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_c
+#endif
+extern prototype_sad(vp8_variance_get4x4sse_cs);
+
+
+typedef prototype_sad(*vp8_sad_fn_t);
+typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
+typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t);
+typedef prototype_variance(*vp8_variance_fn_t);
+typedef prototype_variance2(*vp8_variance2_fn_t);
+typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t);
+typedef prototype_getmbss(*vp8_getmbss_fn_t);
+typedef struct
+{
+ vp8_sad_fn_t sad4x4;
+ vp8_sad_fn_t sad8x8;
+ vp8_sad_fn_t sad8x16;
+ vp8_sad_fn_t sad16x8;
+ vp8_sad_fn_t sad16x16;
+
+ vp8_variance_fn_t var4x4;
+ vp8_variance_fn_t var8x8;
+ vp8_variance_fn_t var8x16;
+ vp8_variance_fn_t var16x8;
+ vp8_variance_fn_t var16x16;
+
+ vp8_subpixvariance_fn_t subpixvar4x4;
+ vp8_subpixvariance_fn_t subpixvar8x8;
+ vp8_subpixvariance_fn_t subpixvar8x16;
+ vp8_subpixvariance_fn_t subpixvar16x8;
+ vp8_subpixvariance_fn_t subpixvar16x16;
+ vp8_subpixvariance_fn_t subpixmse16x16;
+
+ vp8_getmbss_fn_t getmbss;
+ vp8_variance_fn_t mse16x16;
+
+ vp8_sad_fn_t get16x16prederror;
+ vp8_variance2_fn_t get8x8var;
+ vp8_variance2_fn_t get16x16var;
+ vp8_sad_fn_t get4x4sse_cs;
+
+ vp8_sad_multi_fn_t sad16x16x3;
+ vp8_sad_multi_fn_t sad16x8x3;
+ vp8_sad_multi_fn_t sad8x16x3;
+ vp8_sad_multi_fn_t sad8x8x3;
+ vp8_sad_multi_fn_t sad4x4x3;
+
+ vp8_sad_multi_d_fn_t sad16x16x4d;
+ vp8_sad_multi_d_fn_t sad16x8x4d;
+ vp8_sad_multi_d_fn_t sad8x16x4d;
+ vp8_sad_multi_d_fn_t sad8x8x4d;
+ vp8_sad_multi_d_fn_t sad4x4x4d;
+
+} vp8_variance_rtcd_vtable_t;
+
+typedef struct
+{
+ vp8_sad_fn_t sdf;
+ vp8_sad_multi_fn_t sdx3f;
+ vp8_sad_multi_d_fn_t sdx4df;
+ vp8_variance_fn_t vf;
+ vp8_subpixvariance_fn_t svf;
+} vp8_variance_fn_ptr_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define VARIANCE_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn
+#endif
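+/* With runtime CPU detection, calls made through VARIANCE_INVOKE go via the
+   vp8_variance_rtcd_vtable_t above, which the platform csystemdependent code
+   is expected to populate; otherwise they expand directly to the compile-time
+   vp8_variance_##fn defaults selected earlier in this header. */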
+
+/* TODO: Determine if this USEBILINEAR flag is necessary. */
+#define USEBILINEAR
+
+#endif
diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c
new file mode 100644
index 000000000..85269b9d3
--- /dev/null
+++ b/vp8/encoder/variance_c.c
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+
+const int vp8_six_tap[8][6] =
+{
+    { 0, 0, 128, 0, 0, 0 },  // note that 1/8 pel positions are just as per an alpha = -0.5 bicubic filter
+ { 0, -6, 123, 12, -1, 0 },
+ { 2, -11, 108, 36, -8, 1 }, // New 1/4 pel 6 tap filter
+ { 0, -9, 93, 50, -6, 0 },
+ { 3, -16, 77, 77, -16, 3 }, // New 1/2 pel 6 tap filter
+ { 0, -6, 50, 93, -9, 0 },
+ { 1, -8, 36, 108, -11, 2 }, // New 1/4 pel 6 tap filter
+ { 0, -1, 12, 123, -6, 0 }
+};
+
+
+#ifdef USEBILINEAR
+const int VP8_FILTER_WEIGHT = 128;
+const int VP8_FILTER_SHIFT = 7;
+const int vp8_bilinear_taps[8][2] =
+{
+ { 128, 0 },
+ { 112, 16 },
+ { 96, 32 },
+ { 80, 48 },
+ { 64, 64 },
+ { 48, 80 },
+ { 32, 96 },
+ { 16, 112 }
+};
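+// xoffset/yoffset select a row of this table by sub-pel position in 1/8-pel
+// units; entry k holds the tap pair (128 - 16*k, 16*k), so the two taps always
+// sum to VP8_FILTER_WEIGHT (128).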
+
+unsigned int vp8_get_mb_ss_c
+(
+ short *src_ptr
+)
+{
+ unsigned int i = 0, sum = 0;
+
+ do
+ {
+ sum += (src_ptr[i] * src_ptr[i]);
+ i++;
+ }
+ while (i < 256);
+
+ return sum;
+}
+
+
+void vp8_variance(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ int w,
+ int h,
+ unsigned int *sse,
+ int *sum)
+{
+ int i, j;
+ int diff;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++)
+ {
+ for (j = 0; j < w; j++)
+ {
+ diff = src_ptr[j] - ref_ptr[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
+}
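+/* The block-variance helpers below return SSE - Sum*Sum/N for an N-pixel
+   block (N = 16, 64, 128 or 256, hence the shifts of 4, 6, 7 or 8), i.e. the
+   sum of squared deviations of the pixel differences from their mean. */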
+
+unsigned int
+vp8_get8x8var_c
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+)
+{
+
+ vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, SSE, Sum);
+ return (*SSE - (((*Sum) * (*Sum)) >> 6));
+}
+
+unsigned int
+vp8_get16x16var_c
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+)
+{
+
+ vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, SSE, Sum);
+ return (*SSE - (((*Sum) * (*Sum)) >> 8));
+
+}
+
+
+
+unsigned int vp8_variance16x16_c(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp8_variance8x16_c(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+}
+
+unsigned int vp8_variance16x8_c(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+}
+
+
+unsigned int vp8_variance8x8_c(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 6));
+}
+
+unsigned int vp8_variance4x4_c(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 4));
+}
+
+
+unsigned int vp8_mse16x16_c(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+ *sse = var;
+ return var;
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_first_pass
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * UINT32 pixel_step : Offset between filter input samples (see notes).
+ * UINT32 output_height : Input block height.
+ * UINT32 output_width : Input block width.
+ * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement first-pass
+ * of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+void vp8e_filter_block2d_bil_first_pass
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int *vp8_filter
+)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ // Apply bilinear filter
+ output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr          : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * UINT32 pixel_step : Offset between filter input samples (see notes).
+ * UINT32 output_height : Input block height.
+ * UINT32 output_width : Input block width.
+ * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8 *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement second-pass
+ * of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+void vp8e_filter_block2d_bil_second_pass
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ // Apply filter
+ Temp = ((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2);
+ output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * INT32 *HFilter : Array of 2 horizontal filter taps.
+ * INT32 *VFilter : Array of 2 vertical filter taps.
+ *
+ *  OUTPUTS       : UINT8 *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2-D filters an 8x8 input block by applying a 2-tap
+ * bi-linear filter horizontally followed by a 2-tap
+ * bi-linear filter vertically on the result.
+ *
+ * SPECIAL NOTES : The intermediate horizontally filtered block must produce
+ * 1 more point than the input block in each column. This
+ * is to ensure that the 2-tap filter has one extra data-point
+ * at the top of each column so filter taps do not extend
+ * beyond data. Thus the output of the first stage filter
+ *                   is an 8x9 (h x v) block.
+ *
+ ****************************************************************************/
+void vp8e_filter_block2d_bil
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ int *HFilter,
+ int *VFilter
+)
+{
+
+    unsigned short FData[20*16];  // Temp data buffer used in filtering
+
+ // First filter 1-D horizontally...
+ vp8e_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, 9, 8, HFilter);
+
+ // then 1-D vertically...
+ vp8e_filter_block2d_bil_second_pass(FData, output_ptr, 8, 8, 8, 8, VFilter);
+}
+
+
+
+unsigned int vp8_sub_pixel_variance4x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ unsigned char temp2[20*16];
+ const int *HFilter, *VFilter;
+    unsigned short FData3[5*4]; // Temp data buffer used in filtering
+
+ HFilter = vp8_bilinear_taps[xoffset];
+ VFilter = vp8_bilinear_taps[yoffset];
+
+    // First filter 1-D horizontally
+ vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
+
+    // Now filter vertically
+ vp8e_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
+
+ return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[9*8]; // Temp data buffer used in filtering
+ unsigned char temp2[20*16];
+ const int *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_taps[xoffset];
+ VFilter = vp8_bilinear_taps[yoffset];
+
+ vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
+ vp8e_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
+
+ return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[17*16]; // Temp data buffer used in filtering
+ unsigned char temp2[20*16];
+ const int *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_taps[xoffset];
+ VFilter = vp8_bilinear_taps[yoffset];
+
+ vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
+ vp8e_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
+
+ return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_mse16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
+
+unsigned int vp8_sub_pixel_variance16x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[16*9]; // Temp data buffer used in filtering
+ unsigned char temp2[20*16];
+ const int *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_taps[xoffset];
+ VFilter = vp8_bilinear_taps[yoffset];
+
+ vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
+ vp8e_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
+
+ return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance8x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[9*16]; // Temp data buffer used in filtering
+ unsigned char temp2[20*16];
+ const int *HFilter, *VFilter;
+
+
+ HFilter = vp8_bilinear_taps[xoffset];
+ VFilter = vp8_bilinear_taps[yoffset];
+
+
+ vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
+ vp8e_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
+
+ return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
new file mode 100644
index 000000000..186ee6856
--- /dev/null
+++ b/vp8/encoder/x86/csystemdependent.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "onyx_int.h"
+
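+/* Run-time dispatch pointers for the x86 encoder; these are presumably bound
+   to the C, MMX or "_wmt" (SSE2-class) variants declared below once CPU
+   capabilities have been detected. */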
+SADFunction *vp8_sad16x16;
+SADFunction *vp8_sad16x8;
+SADFunction *vp8_sad8x16;
+SADFunction *vp8_sad8x8;
+SADFunction *vp8_sad4x4;
+
+variance_function *vp8_variance4x4;
+variance_function *vp8_variance8x8;
+variance_function *vp8_variance8x16;
+variance_function *vp8_variance16x8;
+variance_function *vp8_variance16x16;
+
+
+variance_function *vp8_mse16x16;
+
+sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
+
+int (*vp8_block_error)(short *, short *);
+int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
+void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+
+extern void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride);
+
+extern int vp8_block_error_c(short *, short *);
+extern int vp8_mbblock_error_c(MACROBLOCK *x, int dc);
+
+extern int vp8_block_error_mmx(short *, short *);
+extern int vp8_mbblock_error_mmx(MACROBLOCK *x, int dc);
+
+extern int vp8_block_error_xmm(short *, short *);
+extern int vp8_mbblock_error_xmm(MACROBLOCK *x, int dc);
+
+
+
+int (*vp8_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp8_get_mb_ss)(short *);
+void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+
+void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// c imports
+extern int vp8_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
+extern void vp8_fast_fdct8x4_c(short *input, short *output, int pitch);
+
+
+extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction vp8_sad16x16_c;
+extern SADFunction vp8_sad16x8_c;
+extern SADFunction vp8_sad8x16_c;
+extern SADFunction vp8_sad8x8_c;
+extern SADFunction vp8_sad4x4_c;
+
+extern SADFunction vp8_sad16x16_wmt;
+extern SADFunction vp8_sad16x8_wmt;
+extern SADFunction vp8_sad8x16_wmt;
+extern SADFunction vp8_sad8x8_wmt;
+extern SADFunction vp8_sad4x4_wmt;
+
+extern SADFunction vp8_sad16x16_mmx;
+extern SADFunction vp8_sad16x8_mmx;
+extern SADFunction vp8_sad8x16_mmx;
+extern SADFunction vp8_sad8x8_mmx;
+extern SADFunction vp8_sad4x4_mmx;
+
+extern variance_function vp8_variance16x16_c;
+extern variance_function vp8_variance8x16_c;
+extern variance_function vp8_variance16x8_c;
+extern variance_function vp8_variance8x8_c;
+extern variance_function vp8_variance4x4_c;
+extern variance_function vp8_mse16x16_c;
+
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_c;
+
+extern unsigned int vp8_get_mb_ss_c(short *);
+extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// mmx imports
+extern int vp8_mbuverror_mmx(MACROBLOCK *mb);
+extern void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d);
+extern void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch);
+extern void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
+extern void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch);
+extern void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch);
+extern variance_function vp8_variance4x4_mmx;
+extern variance_function vp8_variance8x8_mmx;
+extern variance_function vp8_variance8x16_mmx;
+extern variance_function vp8_variance16x8_mmx;
+extern variance_function vp8_variance16x16_mmx;
+
+extern variance_function vp8_mse16x16_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_mmx;
+
+extern unsigned int vp8_get16x16pred_error_mmx(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get_mb_ss_mmx(short *);
+extern unsigned int vp8_get8x8var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get4x4sse_cs_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+
+// wmt imports
+extern int vp8_mbuverror_xmm(MACROBLOCK *mb);
+extern void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d);
+extern void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch);
+extern variance_function vp8_variance4x4_wmt;
+extern variance_function vp8_variance8x8_wmt;
+extern variance_function vp8_variance8x16_wmt;
+extern variance_function vp8_variance16x8_wmt;
+extern variance_function vp8_variance16x16_wmt;
+
+extern variance_function vp8_mse16x16_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_wmt;
+extern unsigned int vp8_get16x16pred_error_sse2(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get_mb_ss_sse2(short *src_ptr);
+extern unsigned int vp8_get8x8var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+void vp8_cmachine_specific_config(void)
+{
+ int mmx_enabled;
+ int xmm_enabled;
+ int wmt_enabled;
+
+ vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+
+ if (wmt_enabled) // Willamette
+ {
+ // Willamette instruction set available:
+ vp8_mbuverror = vp8_mbuverror_xmm;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_sse;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
+ vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt;
+ vp8_subtract_b = vp8_subtract_b_mmx;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
+ vp8_variance4x4 = vp8_variance4x4_mmx;
+ vp8_variance8x8 = vp8_variance8x8_mmx;
+ vp8_variance8x16 = vp8_variance8x16_wmt;
+ vp8_variance16x8 = vp8_variance16x8_wmt;
+ vp8_variance16x16 = vp8_variance16x16_wmt;
+ vp8_mse16x16 = vp8_mse16x16_wmt;
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
+ vp8_get_mb_ss = vp8_get_mb_ss_sse2;
+ vp8_get16x16pred_error = vp8_get16x16pred_error_sse2;
+ vp8_get8x8var = vp8_get8x8var_sse2;
+ vp8_get16x16var = vp8_get16x16var_sse2;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+ vp8_sad16x16 = vp8_sad16x16_wmt;
+ vp8_sad16x8 = vp8_sad16x8_wmt;
+ vp8_sad8x16 = vp8_sad8x16_wmt;
+ vp8_sad8x8 = vp8_sad8x8_wmt;
+ vp8_sad4x4 = vp8_sad4x4_wmt;
+ vp8_block_error = vp8_block_error_xmm;
+ vp8_mbblock_error = vp8_mbblock_error_xmm;
+ vp8_subtract_mby = vp8_subtract_mby_mmx;
+
+ }
+ else if (mmx_enabled)
+ {
+ // MMX instruction set available:
+ vp8_mbuverror = vp8_mbuverror_mmx;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_mmx;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
+ vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx;
+ vp8_subtract_b = vp8_subtract_b_mmx;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
+ vp8_variance4x4 = vp8_variance4x4_mmx;
+ vp8_variance8x8 = vp8_variance8x8_mmx;
+ vp8_variance8x16 = vp8_variance8x16_mmx;
+ vp8_variance16x8 = vp8_variance16x8_mmx;
+ vp8_variance16x16 = vp8_variance16x16_mmx;
+ vp8_mse16x16 = vp8_mse16x16_mmx;
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
+ vp8_get_mb_ss = vp8_get_mb_ss_mmx;
+ vp8_get16x16pred_error = vp8_get16x16pred_error_mmx;
+ vp8_get8x8var = vp8_get8x8var_mmx;
+ vp8_get16x16var = vp8_get16x16var_mmx;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+ vp8_sad16x16 = vp8_sad16x16_mmx;
+ vp8_sad16x8 = vp8_sad16x8_mmx;
+ vp8_sad8x16 = vp8_sad8x16_mmx;
+ vp8_sad8x8 = vp8_sad8x8_mmx;
+ vp8_sad4x4 = vp8_sad4x4_mmx;
+ vp8_block_error = vp8_block_error_mmx;
+ vp8_mbblock_error = vp8_mbblock_error_mmx;
+ vp8_subtract_mby = vp8_subtract_mby_mmx;
+
+ }
+ else
+ {
+ // Pure C:
+ vp8_mbuverror = vp8_mbuverror_c;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
+ vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
+ vp8_subtract_b = vp8_subtract_b_c;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_c;
+ vp8_variance4x4 = vp8_variance4x4_c;
+ vp8_variance8x8 = vp8_variance8x8_c;
+ vp8_variance8x16 = vp8_variance8x16_c;
+ vp8_variance16x8 = vp8_variance16x8_c;
+ vp8_variance16x16 = vp8_variance16x16_c;
+ vp8_mse16x16 = vp8_mse16x16_c;
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
+ vp8_get_mb_ss = vp8_get_mb_ss_c;
+ vp8_get16x16pred_error = vp8_get16x16pred_error_c;
+ vp8_get8x8var = vp8_get8x8var_c;
+ vp8_get16x16var = vp8_get16x16var_c;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
+ vp8_sad16x16 = vp8_sad16x16_c;
+ vp8_sad16x8 = vp8_sad16x8_c;
+ vp8_sad8x16 = vp8_sad8x16_c;
+ vp8_sad8x8 = vp8_sad8x8_c;
+ vp8_sad4x4 = vp8_sad4x4_c;
+ vp8_block_error = vp8_block_error_c;
+ vp8_mbblock_error = vp8_mbblock_error_c;
+ vp8_subtract_mby = vp8_subtract_mby_c;
+ }
+
+}
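
Everything in this file follows one pattern: a global function pointer per kernel, bound once from the detected CPU features, with the plain C routine as the fallback so the encoder still runs on processors without MMX or SSE2. A self-contained sketch of that pattern, with hypothetical names standing in for the real kernels:

    typedef int (*kernel_fn)(const short *a, const short *b);

    static int kernel_c(const short *a, const short *b)    { return a[0] - b[0]; } /* portable body */
    static int kernel_mmx(const short *a, const short *b)  { return a[0] - b[0]; } /* MMX body */
    static int kernel_sse2(const short *a, const short *b) { return a[0] - b[0]; } /* SSE2 body */

    static kernel_fn kernel = kernel_c;        /* safe default before configuration */

    void machine_specific_config(int mmx_enabled, int wmt_enabled)
    {
        if (wmt_enabled)                       /* SSE2 ("Willamette") preferred */
            kernel = kernel_sse2;
        else if (mmx_enabled)
            kernel = kernel_mmx;
        else
            kernel = kernel_c;                 /* pure C fallback */
    }

Call sites always go through the pointer, so the C, MMX and SSE2 paths share the same calling code; only the one-time configuration differs.
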
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
new file mode 100644
index 000000000..e13423796
--- /dev/null
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -0,0 +1,846 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+section .text
+ global sym(vp8_short_fdct4x4_mmx)
+ global sym(vp8_fast_fdct4x4_mmx)
+ global sym(vp8_fast_fdct8x4_wmt)
+
+
+%define DCTCONSTANTSBITS (16)
+%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
+%define x_c1 (60547) ; cos(pi /8) * (1<<15)
+%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
+%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
+
+
+%define _1STSTAGESHIFT 14
+%define _2NDSTAGESHIFT 16
+
+; implemented as a matrix multiply; the source buffer has a pitch
+;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+sym(vp8_short_fdct4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;input
+ mov rdi, arg(1) ;output
+
+ movsxd rax, dword ptr arg(2) ;pitch
+ lea rdx, [dct_matrix GLOBAL]
+
+ movq mm0, [rsi ]
+ movq mm1, [rsi + rax]
+
+ movq mm2, [rsi + rax*2]
+ lea rsi, [rsi + rax*2]
+
+ movq mm3, [rsi + rax]
+
+ ; first column
+ movq mm4, mm0
+ movq mm7, [rdx]
+
+ pmaddwd mm4, mm7
+ movq mm5, mm1
+
+ pmaddwd mm5, mm7
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+
+ pmaddwd mm5, mm7
+ movq mm6, mm3
+
+ pmaddwd mm6, mm7
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _1STSTAGESHIFT
+ psrad mm5, _1STSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi], mm4
+
+ ;second column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+8]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+8]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _1STSTAGESHIFT
+ psrad mm5, _1STSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+8], mm4
+
+
+ ;third column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+16]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+16]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _1STSTAGESHIFT
+ psrad mm5, _1STSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+16], mm4
+
+        ;fourth column (this is the last column, so we do not have to save the source any more)
+
+ pmaddwd mm0, [rdx+24]
+
+ pmaddwd mm1, [rdx+24]
+ movq mm6, mm0
+
+ punpckldq mm0, mm1
+ punpckhdq mm6, mm1
+
+ paddd mm0, mm6
+
+ pmaddwd mm2, [rdx+24]
+
+ pmaddwd mm3, [rdx+24]
+ movq mm7, mm2
+
+ punpckldq mm2, mm3
+ punpckhdq mm7, mm3
+
+ paddd mm2, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm0, mm6
+ paddd mm2, mm6
+
+ psrad mm0, _1STSTAGESHIFT
+ psrad mm2, _1STSTAGESHIFT
+
+ packssdw mm0, mm2
+
+ movq mm3, mm0
+
+ ; done with one pass
+ ; now start second pass
+ movq mm0, [rdi ]
+ movq mm1, [rdi+ 8]
+ movq mm2, [rdi+ 16]
+
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi], mm4
+
+ ;second column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+8]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+8]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+8], mm4
+
+
+ ;third column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+16]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+16]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+16], mm4
+
+ ;fourth column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+24]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+24]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+24]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+24]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+24], mm4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
+sym(vp8_fast_fdct4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;input
+ mov rdi, arg(1) ;output
+
+ lea rdx, [dct_const_mmx GLOBAL]
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ lea rcx, [rsi + rax*2]
+ ; read the input data
+ movq mm0, [rsi]
+ movq mm1, [rsi + rax ]
+
+ movq mm2, [rcx]
+ movq mm3, [rcx + rax]
+ ; get the constants
+        ;shift left by 1 for precision
+ paddw mm0, mm0
+ paddw mm1, mm1
+
+ psllw mm2, 1
+ psllw mm3, 1
+
+ ; transpose for the second stage
+ movq mm4, mm0 ; 00 01 02 03
+        movq        mm5,        mm2             ; 20 21 22 23
+
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm4, mm1 ; 02 12 03 13
+
+ punpcklwd mm2, mm3 ; 20 30 21 31
+ punpckhwd mm5, mm3 ; 22 32 23 33
+
+
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
+
+ punpckhdq mm1, mm2 ; 01 11 21 31
+
+ movq mm2, mm4 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
+
+ punpckhdq mm4, mm5 ; 03 13 23 33
+ movq mm3, mm4
+
+
+ ; first stage
+ movq mm5, mm0
+ movq mm4, mm1
+
+ paddw mm0, mm3 ; a = 0 + 3
+ paddw mm1, mm2 ; b = 1 + 2
+
+ psubw mm4, mm2 ; c = 1 - 2
+ psubw mm5, mm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movq mm6, [rdx + 16] ; c2
+ movq mm2, mm0 ; a
+
+ paddw mm0, mm1 ; a + b
+ psubw mm2, mm1 ; a - b
+
+ movq mm1, mm0 ; a + b
+ pmulhw mm0, mm6 ; 00 01 02 03
+
+ paddw mm0, mm1 ; output 00 01 02 03
+ pmulhw mm6, mm2 ; 20 21 22 23
+
+ paddw mm2, mm6 ; output 20 21 22 23
+
+ ; output 1 and 3
+ movq mm6, [rdx + 8] ; c1
+ movq mm7, [rdx + 24] ; c3
+
+ movq mm1, mm4 ; c
+ movq mm3, mm5 ; d
+
+ pmulhw mm1, mm7 ; c * c3
+ pmulhw mm3, mm6 ; d * c1
+
+ paddw mm3, mm5 ; d * c1 rounded
+ paddw mm1, mm3 ; output 10 11 12 13
+
+ movq mm3, mm4 ; c
+ pmulhw mm5, mm7 ; d * c3
+
+ pmulhw mm4, mm6 ; c * c1
+ paddw mm3, mm4 ; round c* c1
+
+ psubw mm5, mm3 ; output 30 31 32 33
+ movq mm3, mm5
+
+
+ ; done with vertical
+ ; transpose for the second stage
+ movq mm4, mm0 ; 00 01 02 03
+        movq        mm5,        mm2             ; 20 21 22 23
+
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm4, mm1 ; 02 12 03 13
+
+ punpcklwd mm2, mm3 ; 20 30 21 31
+ punpckhwd mm5, mm3 ; 22 32 23 33
+
+
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
+
+ punpckhdq mm1, mm2 ; 01 11 21 31
+
+ movq mm2, mm4 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
+
+ punpckhdq mm4, mm5 ; 03 13 23 33
+ movq mm3, mm4
+
+
+ ; first stage
+ movq mm5, mm0
+ movq mm4, mm1
+
+ paddw mm0, mm3 ; a = 0 + 3
+ paddw mm1, mm2 ; b = 1 + 2
+
+ psubw mm4, mm2 ; c = 1 - 2
+ psubw mm5, mm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movq mm6, [rdx + 16] ; c2
+ movq mm2, mm0 ; a
+ paddw mm0, mm1 ; a + b
+
+ psubw mm2, mm1 ; a - b
+
+ movq mm1, mm0 ; a + b
+ pmulhw mm0, mm6 ; 00 01 02 03
+
+ paddw mm0, mm1 ; output 00 01 02 03
+ pmulhw mm6, mm2 ; 20 21 22 23
+
+ paddw mm2, mm6 ; output 20 21 22 23
+
+
+ ; output 1 and 3
+ movq mm6, [rdx + 8] ; c1
+ movq mm7, [rdx + 24] ; c3
+
+ movq mm1, mm4 ; c
+ movq mm3, mm5 ; d
+
+ pmulhw mm1, mm7 ; c * c3
+ pmulhw mm3, mm6 ; d * c1
+
+ paddw mm3, mm5 ; d * c1 rounded
+ paddw mm1, mm3 ; output 10 11 12 13
+
+ movq mm3, mm4 ; c
+ pmulhw mm5, mm7 ; d * c3
+
+ pmulhw mm4, mm6 ; c * c1
+ paddw mm3, mm4 ; round c* c1
+
+ psubw mm5, mm3 ; output 30 31 32 33
+ movq mm3, mm5
+ ; done with vertical
+
+ pcmpeqw mm4, mm4
+ pcmpeqw mm5, mm5
+ psrlw mm4, 15
+ psrlw mm5, 15
+
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm2, mm4
+ paddw mm3, mm5
+
+ psraw mm0, 1
+ psraw mm1, 1
+ psraw mm2, 1
+ psraw mm3, 1
+
+ movq [rdi ], mm0
+ movq [rdi+ 8], mm1
+ movq [rdi+16], mm2
+ movq [rdi+24], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_fast_fdct8x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;input
+ mov rdi, arg(1) ;output
+
+ lea rdx, [dct_const_xmm GLOBAL]
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ lea rcx, [rsi + rax*2]
+ ; read the input data
+ movdqa xmm0, [rsi]
+ movdqa xmm2, [rsi + rax]
+
+ movdqa xmm4, [rcx]
+ movdqa xmm3, [rcx + rax]
+ ; get the constants
+        ;shift left by 1 for precision
+ psllw xmm0, 1
+ psllw xmm2, 1
+
+ psllw xmm4, 1
+ psllw xmm3, 1
+
+ ; transpose for the second stage
+ movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
+ movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
+
+ punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
+ punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
+
+ punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
+ punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
+
+ movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
+ punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
+
+ punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
+
+
+ movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
+ punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
+
+ punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
+ movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
+
+ punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
+ punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
+ punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1,       xmm4            ; 01 11 21 31 05 15 25 35
+
+ ; xmm0 0
+ ; xmm1 1
+ ; xmm2 2
+ ; xmm3 3
+
+ ; first stage
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm1
+
+ paddw xmm0, xmm3 ; a = 0 + 3
+ paddw xmm1, xmm2 ; b = 1 + 2
+
+ psubw xmm4, xmm2 ; c = 1 - 2
+ psubw xmm5, xmm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movdqa xmm6, [rdx + 32] ; c2
+ movdqa xmm2, xmm0 ; a
+
+ paddw xmm0, xmm1 ; a + b
+ psubw xmm2, xmm1 ; a - b
+
+ movdqa xmm1, xmm0 ; a + b
+ pmulhw xmm0, xmm6 ; 00 01 02 03
+
+ paddw xmm0, xmm1 ; output 00 01 02 03
+ pmulhw xmm6, xmm2 ; 20 21 22 23
+
+ paddw xmm2, xmm6 ; output 20 21 22 23
+
+ ; output 1 and 3
+ movdqa xmm6, [rdx + 16] ; c1
+ movdqa xmm7, [rdx + 48] ; c3
+
+ movdqa xmm1, xmm4 ; c
+ movdqa xmm3, xmm5 ; d
+
+ pmulhw xmm1, xmm7 ; c * c3
+ pmulhw xmm3, xmm6 ; d * c1
+
+ paddw xmm3, xmm5 ; d * c1 rounded
+ paddw xmm1, xmm3 ; output 10 11 12 13
+
+ movdqa xmm3, xmm4 ; c
+ pmulhw xmm5, xmm7 ; d * c3
+
+ pmulhw xmm4, xmm6 ; c * c1
+ paddw xmm3, xmm4 ; round c* c1
+
+ psubw xmm5, xmm3 ; output 30 31 32 33
+ movdqa xmm3, xmm5
+
+
+ ; done with vertical
+ ; transpose for the second stage
+ movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36
+ movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34
+ movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36
+
+ punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31
+ punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35
+
+ punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33
+ punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
+
+ movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31
+ punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13
+
+ punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33
+
+
+ movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35
+ punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17
+
+ punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37
+ movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33
+
+ punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37
+ punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27
+
+ movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13
+ punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07
+
+ punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17
+
+ ; first stage
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm1
+
+ paddw xmm0, xmm3 ; a = 0 + 3
+ paddw xmm1, xmm2 ; b = 1 + 2
+
+ psubw xmm4, xmm2 ; c = 1 - 2
+ psubw xmm5, xmm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movdqa xmm6, [rdx + 32] ; c2
+ movdqa xmm2, xmm0 ; a
+
+ paddw xmm0, xmm1 ; a + b
+ psubw xmm2, xmm1 ; a - b
+
+ movdqa xmm1, xmm0 ; a + b
+ pmulhw xmm0, xmm6 ; 00 01 02 03
+
+ paddw xmm0, xmm1 ; output 00 01 02 03
+ pmulhw xmm6, xmm2 ; 20 21 22 23
+
+ paddw xmm2, xmm6 ; output 20 21 22 23
+
+ ; output 1 and 3
+ movdqa xmm6, [rdx + 16] ; c1
+ movdqa xmm7, [rdx + 48] ; c3
+
+ movdqa xmm1, xmm4 ; c
+ movdqa xmm3, xmm5 ; d
+
+ pmulhw xmm1, xmm7 ; c * c3
+ pmulhw xmm3, xmm6 ; d * c1
+
+ paddw xmm3, xmm5 ; d * c1 rounded
+ paddw xmm1, xmm3 ; output 10 11 12 13
+
+ movdqa xmm3, xmm4 ; c
+ pmulhw xmm5, xmm7 ; d * c3
+
+ pmulhw xmm4, xmm6 ; c * c1
+ paddw xmm3, xmm4 ; round c* c1
+
+ psubw xmm5, xmm3 ; output 30 31 32 33
+ movdqa xmm3, xmm5
+ ; done with vertical
+
+
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm5, xmm5;
+ psrlw xmm4, 15
+ psrlw xmm5, 15
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ psraw xmm0, 1
+ psraw xmm1, 1
+ psraw xmm2, 1
+ psraw xmm3, 1
+
+ movq QWORD PTR[rdi ], xmm0
+ movq QWORD PTR[rdi+ 8], xmm1
+ movq QWORD PTR[rdi+16], xmm2
+ movq QWORD PTR[rdi+24], xmm3
+
+ psrldq xmm0, 8
+ psrldq xmm1, 8
+ psrldq xmm2, 8
+ psrldq xmm3, 8
+
+ movq QWORD PTR[rdi+32], xmm0
+ movq QWORD PTR[rdi+40], xmm1
+ movq QWORD PTR[rdi+48], xmm2
+ movq QWORD PTR[rdi+56], xmm3
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+;static const unsigned int dct1st_stage_rounding_mmx[2] =
+align 16
+dct1st_stage_rounding_mmx:
+ times 2 dd 8192
+
+
+;static const unsigned int dct2nd_stage_rounding_mmx[2] =
+align 16
+dct2nd_stage_rounding_mmx:
+ times 2 dd 32768
+
+
+;static const short dct_matrix[4][4]=
+align 16
+dct_matrix:
+ times 4 dw 23170
+
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+
+ dw 23170
+ times 2 dw -23170
+ dw 23170
+
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540
+
+
+;static const unsigned short dct_const_mmx[4 * 4]=
+align 16
+dct_const_mmx:
+ times 4 dw 0
+ times 4 dw 60547
+ times 4 dw 46341
+ times 4 dw 25080
+
+
+;static const unsigned short dct_const_xmm[8 * 4]=
+align 16
+dct_const_xmm:
+ times 8 dw 0
+ times 8 dw 60547
+ times 8 dw 46341
+ times 8 dw 25080
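
For reference, a scalar C sketch of the matrix-multiply forward DCT that vp8_short_fdct4x4_mmx implements: two identical passes against dct_matrix, the first rounded with 1<<13 and shifted right by 14, the second rounded with 1<<15 and shifted right by 16. The saturating packssdw packs are omitted, so treat this as an illustration rather than a bit-exact reference.

    static const short dct_mat[4][4] =
    {
        { 23170,  23170,  23170,  23170 },
        { 30274,  12540, -12540, -30274 },
        { 23170, -23170, -23170,  23170 },
        { 12540, -30274,  30274, -12540 },
    };

    void short_fdct4x4_ref(const short *input, short *output, int pitch)
    {
        short tmp[16];
        int i, j, k;
        int stride = pitch / 2;       /* pitch is in bytes, samples are 16 bits */

        /* First pass: dot each input row with each basis row, >> 14. */
        for (j = 0; j < 4; j++)
            for (i = 0; i < 4; i++)
            {
                int acc = 1 << 13;
                for (k = 0; k < 4; k++)
                    acc += dct_mat[j][k] * input[i * stride + k];
                tmp[j * 4 + i] = (short)(acc >> 14);
            }

        /* Second pass: same multiply against the intermediate rows, >> 16. */
        for (j = 0; j < 4; j++)
            for (i = 0; i < 4; i++)
            {
                int acc = 1 << 15;
                for (k = 0; k < 4; k++)
                    acc += dct_mat[j][k] * tmp[i * 4 + k];
                output[j * 4 + i] = (short)(acc >> 16);
            }
    }

The matrix entries are the DCT-II basis cosines in Q15 (for example 23170 is roughly 0.7071 * 32768), and the two shifts together remove the 30 fractional bits that the two Q15 multiplies introduce.
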
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
new file mode 100644
index 000000000..3e5e9a70c
--- /dev/null
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -0,0 +1,260 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_short_fdct4x4_wmt)
+
+%define DCTCONSTANTSBITS (16)
+%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
+%define x_c1 (60547) ; cos(pi /8) * (1<<15)
+%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
+%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
+
+%define _1STSTAGESHIFT 14
+%define _2NDSTAGESHIFT 16
+
+
+;; using matrix multiply
+;void vp8_short_fdct4x4_wmt(short *input, short *output)
+sym(vp8_short_fdct4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ GET_GOT rbx
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rcx, arg(1) ;output
+
+ lea rdx, [dct_matrix_sse2 GLOBAL]
+
+ movdqu xmm0, [rax ]
+ movdqu xmm1, [rax+16]
+
+ ; first column
+ movdqa xmm2, xmm0
+ movdqa xmm7, [rdx]
+
+ pmaddwd xmm2, xmm7
+ movdqa xmm3, xmm1
+
+ pmaddwd xmm3, xmm7
+ movdqa xmm4, xmm2
+
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ movdqa xmm3, xmm2
+ punpckldq xmm2, xmm4
+
+ punpckhdq xmm3, xmm4
+ paddd xmm2, xmm3
+
+
+ paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+ psrad xmm2, _1STSTAGESHIFT
+ ;second column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+16]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+16]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+
+ psrad xmm3, _1STSTAGESHIFT
+ packssdw xmm2, xmm3
+
+ ;third column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+32]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+32]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _1STSTAGESHIFT
+
+        ;fourth column (this is the last column, so we do not have to save the source any more)
+ pmaddwd xmm0, [rdx+48]
+ pmaddwd xmm1, [rdx+48]
+
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+
+ punpckhdq xmm4, xmm1
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm4
+ punpckhdq xmm1, xmm4
+
+ paddd xmm0, xmm1
+ paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+
+ psrad xmm0, _1STSTAGESHIFT
+ packssdw xmm3, xmm0
+ ; done with one pass
+ ; now start second pass
+ movdqa xmm0, xmm2
+ movdqa xmm1, xmm3
+
+ pmaddwd xmm2, xmm7
+ pmaddwd xmm3, xmm7
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+
+ punpckhdq xmm4, xmm3
+ movdqa xmm3, xmm2
+
+ punpckldq xmm2, xmm4
+ punpckhdq xmm3, xmm4
+
+ paddd xmm2, xmm3
+ paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm2, _2NDSTAGESHIFT
+
+ ;second column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+16]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+16]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _2NDSTAGESHIFT
+ packssdw xmm2, xmm3
+
+ movdqu [rcx], xmm2
+ ;third column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+32]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+32]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _2NDSTAGESHIFT
+ ;fourth column
+ pmaddwd xmm0, [rdx+48]
+ pmaddwd xmm1, [rdx+48]
+
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+
+ punpckhdq xmm4, xmm1
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm4
+ punpckhdq xmm1, xmm4
+
+ paddd xmm0, xmm1
+ paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm0, _2NDSTAGESHIFT
+ packssdw xmm3, xmm0
+
+ movdqu [rcx+16], xmm3
+
+ mov rsp, rbp
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+;static unsigned int dct1st_stage_rounding_sse2[4] =
+align 16
+dct1st_stage_rounding_sse2:
+ times 4 dd 8192
+
+
+;static unsigned int dct2nd_stage_rounding_sse2[4] =
+align 16
+dct2nd_stage_rounding_sse2:
+ times 4 dd 32768
+
+;static short dct_matrix_sse2[4][8]=
+align 16
+dct_matrix_sse2:
+ times 8 dw 23170
+
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+
+ dw 23170
+ times 2 dw -23170
+ times 2 dw 23170
+ times 2 dw -23170
+ dw 23170
+
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
new file mode 100644
index 000000000..bc80e64ef
--- /dev/null
+++ b/vp8/encoder/x86/dct_x86.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef DCT_X86_H
+#define DCT_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+#if HAVE_MMX
+extern prototype_fdct(vp8_short_fdct4x4_mmx);
+extern prototype_fdct(vp8_short_fdct8x4_mmx);
+extern prototype_fdct(vp8_fast_fdct4x4_mmx);
+extern prototype_fdct(vp8_fast_fdct8x4_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
+
+#undef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
+
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx
+
+#undef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_fdct(vp8_short_fdct4x4_wmt);
+extern prototype_fdct(vp8_short_fdct8x4_wmt);
+extern prototype_fdct(vp8_fast_fdct8x4_wmt);
+
+extern prototype_fdct(vp8_short_walsh4x4_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#if 0
+/* short SSE2 DCT currently disabled, does not match the MMX version */
+#undef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt
+
+#undef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt
+#endif
+
+#undef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt
+
+#undef vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
+
+#endif
+
+
+#endif
+
+#endif
diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h
new file mode 100644
index 000000000..9397a6cca
--- /dev/null
+++ b/vp8/encoder/x86/encodemb_x86.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef ENCODEMB_X86_H
+#define ENCODEMB_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+#if HAVE_MMX
+extern prototype_berr(vp8_block_error_mmx);
+extern prototype_mberr(vp8_mbblock_error_mmx);
+extern prototype_mbuverr(vp8_mbuverror_mmx);
+extern prototype_subb(vp8_subtract_b_mmx);
+extern prototype_submby(vp8_subtract_mby_mmx);
+extern prototype_submbuv(vp8_subtract_mbuv_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_encodemb_berr
+#define vp8_encodemb_berr vp8_block_error_mmx
+
+#undef vp8_encodemb_mberr
+#define vp8_encodemb_mberr vp8_mbblock_error_mmx
+
+#undef vp8_encodemb_mbuverr
+#define vp8_encodemb_mbuverr vp8_mbuverror_mmx
+
+#undef vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_mmx
+
+#undef vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_mmx
+
+#undef vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_berr(vp8_block_error_xmm);
+extern prototype_mberr(vp8_mbblock_error_xmm);
+extern prototype_mbuverr(vp8_mbuverror_xmm);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_encodemb_berr
+#define vp8_encodemb_berr vp8_block_error_xmm
+
+#undef vp8_encodemb_mberr
+#define vp8_encodemb_mberr vp8_mbblock_error_xmm
+
+#undef vp8_encodemb_mbuverr
+#define vp8_encodemb_mbuverr vp8_mbuverror_xmm
+
+#endif
+#endif
+
+
+#endif
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
new file mode 100644
index 000000000..194047155
--- /dev/null
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -0,0 +1,393 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp8_block_error_xmm)
+sym(vp8_block_error_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor xmm7, xmm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ movdqa xmm3, [rsi]
+
+ movdqa xmm4, [rdi]
+ movdqa xmm5, [rsi+16]
+
+ movdqa xmm6, [rdi+16]
+ pxor xmm1, xmm1 ; from movd xmm1, dc; dc=0
+
+ movdqa xmm2, xmm7
+ psubw xmm5, xmm6
+
+ por xmm1, xmm2
+ pmaddwd xmm5, xmm5
+
+ pcmpeqw xmm1, xmm7
+ psubw xmm3, xmm4
+
+ pand xmm1, xmm3
+ pmaddwd xmm1, xmm1
+
+ paddd xmm1, xmm5
+ movdqa xmm0, xmm1
+
+ punpckldq xmm0, xmm7
+ punpckhdq xmm1, xmm7
+
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ psrldq xmm0, 8
+ paddd xmm0, xmm1
+
+ movd rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp8_block_error_mmx)
+sym(vp8_block_error_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ movq mm3, [rsi]
+
+ movq mm4, [rdi]
+ movq mm5, [rsi+8]
+
+ movq mm6, [rdi+8]
+ pxor mm1, mm1 ; from movd mm1, dc ; dc =0
+
+ movq mm2, mm7
+ psubw mm5, mm6
+
+ por mm1, mm2
+ pmaddwd mm5, mm5
+
+ pcmpeqw mm1, mm7
+ psubw mm3, mm4
+
+ pand mm1, mm3
+ pmaddwd mm1, mm1
+
+ paddd mm1, mm5
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm3, mm5
+
+ paddd mm1, mm3
+ movq mm0, mm1
+
+ psrlq mm1, 32
+ paddd mm0, mm1
+
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_mmx_impl)
+sym(vp8_mbblock_error_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor mm2, mm2
+
+ movd mm1, dword ptr arg(2) ;dc
+ por mm1, mm2
+
+ pcmpeqw mm1, mm7
+ mov rcx, 16
+
+mberror_loop_mmx:
+ movq mm3, [rsi]
+ movq mm4, [rdi]
+
+ movq mm5, [rsi+8]
+ movq mm6, [rdi+8]
+
+
+ psubw mm5, mm6
+ pmaddwd mm5, mm5
+
+ psubw mm3, mm4
+ pand mm3, mm1
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ add rsi, 32
+
+ add rdi, 32
+ sub rcx, 1
+
+ jnz mberror_loop_mmx
+
+ movq mm0, mm2
+ psrlq mm2, 32
+
+ paddd mm0, mm2
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_xmm_impl)
+sym(vp8_mbblock_error_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor xmm7, xmm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor xmm2, xmm2
+
+ movd xmm1, dword ptr arg(2) ;dc
+ por xmm1, xmm2
+
+ pcmpeqw xmm1, xmm7
+ mov rcx, 16
+
+mberror_loop:
+ movdqa xmm3, [rsi]
+ movdqa xmm4, [rdi]
+
+ movdqa xmm5, [rsi+16]
+ movdqa xmm6, [rdi+16]
+
+
+ psubw xmm5, xmm6
+ pmaddwd xmm5, xmm5
+
+ psubw xmm3, xmm4
+ pand xmm3, xmm1
+
+ pmaddwd xmm3, xmm3
+ add rsi, 32
+
+ add rdi, 32
+
+ sub rcx, 1
+ paddd xmm2, xmm5
+
+ paddd xmm2, xmm3
+ jnz mberror_loop
+
+ movdqa xmm0, xmm2
+ punpckldq xmm0, xmm7
+
+ punpckhdq xmm2, xmm7
+ paddd xmm0, xmm2
+
+ movdqa xmm1, xmm0
+ psrldq xmm0, 8
+
+ paddd xmm0, xmm1
+ movd rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_mmx_impl)
+sym(vp8_mbuverror_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor mm7, mm7
+
+mbuverror_loop_mmx:
+
+ movq mm1, [rsi]
+ movq mm2, [rdi]
+
+ psubw mm1, mm2
+ pmaddwd mm1, mm1
+
+
+ movq mm3, [rsi+8]
+ movq mm4, [rdi+8]
+
+ psubw mm3, mm4
+ pmaddwd mm3, mm3
+
+
+ paddd mm7, mm1
+ paddd mm7, mm3
+
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz mbuverror_loop_mmx
+
+ movq mm0, mm7
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_xmm_impl)
+sym(vp8_mbuverror_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor xmm7, xmm7
+
+mbuverror_loop:
+
+ movdqa xmm1, [rsi]
+ movdqa xmm2, [rdi]
+
+ psubw xmm1, xmm2
+ pmaddwd xmm1, xmm1
+
+ paddd xmm7, xmm1
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz mbuverror_loop
+
+ pxor xmm0, xmm0
+ movdqa xmm1, xmm7
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ paddd xmm1, xmm2
+
+ movdqa xmm2, xmm1
+
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ movd rax, xmm1
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
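
Scalar equivalents of the error kernels above, for reference: each is a sum of squared differences between the original and the dequantized coefficients. In the macroblock version a non-zero dc argument masks out coefficient 0 of every 4x4 block, matching the word mask the SIMD code builds from dc; the contiguous layout of 16 blocks of 16 coefficients is how the _impl loops walk memory.

    int block_error_ref(const short *coeff, const short *dqcoeff)
    {
        int i, err = 0;

        for (i = 0; i < 16; i++)
        {
            int d = coeff[i] - dqcoeff[i];
            err += d * d;
        }

        return err;
    }

    int mbblock_error_ref(const short *coeff, const short *dqcoeff, int dc)
    {
        int block, i, err = 0;

        for (block = 0; block < 16; block++)       /* 16 luma 4x4 blocks */
            for (i = dc ? 1 : 0; i < 16; i++)      /* skip the DC term when dc != 0 */
            {
                int d = coeff[block * 16 + i] - dqcoeff[block * 16 + i];
                err += d * d;
            }

        return err;
    }
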
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
new file mode 100644
index 000000000..7d8620178
--- /dev/null
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -0,0 +1,117 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_walsh4x4_sse2)
+sym(vp8_short_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0)
+ mov rdi, arg(1)
+
+ movdqu xmm4, [rsi + 0] ;ip[4] ip[0]
+ movdqu xmm0, [rsi + 16] ;ip[12] ip[8]
+
+ pxor xmm7, xmm7
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
+
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm3 ;d1 a1
+ punpckhqdq xmm5, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm5 ;c1 b1
+        paddw       xmm5,       xmm4            ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm5 ;ip[4] ip[0]
+
+ paddw xmm5, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm6, xmm5
+ punpcklqdq xmm5, xmm3 ;d1 a1
+ punpckhqdq xmm6, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm6 ;c1 b1
+        paddw       xmm6,       xmm5            ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm5, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ movdqa xmm0, xmm6 ;aka b2 a2
+ movdqa xmm1, xmm5 ;aka d2 c2
+
+ pcmpgtw xmm0, xmm7
+ pcmpgtw xmm1, xmm7
+
+ psrlw xmm0, 15
+ psrlw xmm1, 15
+
+ paddw xmm6, xmm0
+ paddw xmm5, xmm1
+
+ psraw xmm6, 1
+ psraw xmm5, 1
+
+ ; a2 = a1 + b1;
+ ; b2 = c1 + d1;
+ ; c2 = a1 - b1;
+ ; d2 = d1 - c1;
+ ; a2 += (a2>0);
+ ; b2 += (b2>0);
+ ; c2 += (c2>0);
+ ; d2 += (d2>0);
+ ; op[0] = (a2)>>1;
+ ; op[4] = (b2)>>1;
+ ; op[8] = (c2)>>1;
+ ; op[12]= (d2)>>1;
+
+ movdqu [rdi + 0], xmm6
+ movdqu [rdi + 16], xmm5
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
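
A scalar sketch of the 4x4 Walsh-Hadamard transform the routine above implements, following the butterfly pseudocode in its comments: a plain butterfly pass in one direction, then the same butterfly in the other direction with the (x > 0) rounding and a final shift right by one. The data layout and traversal order here are illustrative, and 16-bit wraparound in the intermediate stage is ignored, so this is not claimed to be bit-exact.

    void short_walsh4x4_ref(const short *input, short *output)
    {
        short tmp[16];
        int i;

        for (i = 0; i < 4; i++)                 /* first direction */
        {
            int a1 = input[i * 4 + 0] + input[i * 4 + 3];
            int b1 = input[i * 4 + 1] + input[i * 4 + 2];
            int c1 = input[i * 4 + 1] - input[i * 4 + 2];
            int d1 = input[i * 4 + 0] - input[i * 4 + 3];

            tmp[i * 4 + 0] = (short)(a1 + b1);
            tmp[i * 4 + 1] = (short)(c1 + d1);
            tmp[i * 4 + 2] = (short)(a1 - b1);
            tmp[i * 4 + 3] = (short)(d1 - c1);
        }

        for (i = 0; i < 4; i++)                 /* second direction, with rounding */
        {
            int a1 = tmp[i + 0] + tmp[i + 12];
            int b1 = tmp[i + 4] + tmp[i +  8];
            int c1 = tmp[i + 4] - tmp[i +  8];
            int d1 = tmp[i + 0] - tmp[i + 12];

            int a2 = a1 + b1, b2 = c1 + d1, c2 = a1 - b1, d2 = d1 - c1;

            a2 += (a2 > 0);                     /* round positive values up */
            b2 += (b2 > 0);
            c2 += (c2 > 0);
            d2 += (d2 > 0);

            output[i +  0] = (short)(a2 >> 1);
            output[i +  4] = (short)(b2 >> 1);
            output[i +  8] = (short)(c2 >> 1);
            output[i + 12] = (short)(d2 >> 1);
        }
    }
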
diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h
new file mode 100644
index 000000000..5661491ad
--- /dev/null
+++ b/vp8/encoder/x86/mcomp_x86.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef MCOMP_X86_H
+#define MCOMP_X86_H
+
+#if HAVE_SSE3
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sadx3
+
+#undef vp8_search_diamond_search
+#define vp8_search_diamond_search vp8_diamond_search_sadx4
+
+#endif
+#endif
+
+#endif
+
diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c
new file mode 100644
index 000000000..69617ca47
--- /dev/null
+++ b/vp8/encoder/x86/preproc_mmx.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "memory.h"
+#include "preproc.h"
+#include "pragmas.h"
+
+/****************************************************************************
+* Macros
+****************************************************************************/
+#define FRAMECOUNT 7
+#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
+
+/****************************************************************************
+* Imports
+****************************************************************************/
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+/****************************************************************************
+* Exported Global Variables
+****************************************************************************/
+void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
+
+/****************************************************************************
+ *
+ * ROUTINE : temp_filter_wmt
+ *
+ * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ * unsigned char *s : Pointer to source frame.
+ * unsigned char *d : Pointer to destination frame.
+ * int bytes : Number of bytes to filter.
+ * int strength : Strength of filter to apply.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ *  FUNCTION    : Performs a closeness-adjusted temporal blur
+ *
+ * SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_wmt
+(
+ pre_proc_instance *ppi,
+ unsigned char *s,
+ unsigned char *d,
+ int bytes,
+ int strength
+)
+{
+ int byte = 0;
+ unsigned char *frameptr = ppi->frame_buffer;
+
+ __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3};
+ __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
+
+ if (ppi->frame == 0)
+ {
+ do
+ {
+ int i;
+ int frame = 0;
+
+ do
+ {
+ for (i = 0; i < 8; i++)
+ {
+ *frameptr = s[byte+i];
+ ++frameptr;
+ }
+
+ ++frame;
+ }
+ while (frame < FRAMECOUNT);
+
+ for (i = 0; i < 8; i++)
+ d[byte+i] = s[byte+i];
+
+ byte += 8;
+
+ }
+ while (byte < bytes);
+ }
+ else
+ {
+ int i;
+ int offset2 = (ppi->frame % FRAMECOUNT);
+
+ do
+ {
+ __declspec(align(16)) unsigned short counts[8];
+ __declspec(align(16)) unsigned short sums[8];
+ __asm
+ {
+ mov eax, offset2
+ mov edi, s // source pixels
+ pxor xmm1, xmm1 // accumulator
+
+ pxor xmm7, xmm7
+
+ mov esi, frameptr // accumulator
+ pxor xmm2, xmm2 // count
+
+ movq xmm3, QWORD PTR [edi]
+
+ movq QWORD PTR [esi+8*eax], xmm3
+
+ punpcklbw xmm3, xmm2 // xmm3 source pixels
+ mov ecx, FRAMECOUNT
+
+ next_frame:
+ movq xmm4, QWORD PTR [esi] // get frame buffer values
+ punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels
+ movdqa xmm6, xmm4 // save the pixel values
+ psubsw xmm4, xmm3 // subtracted pixel values
+ pmullw xmm4, xmm4 // square xmm4
+ movd xmm5, strength
+ psrlw xmm4, xmm5 // should be strength
+ pmullw xmm4, threes // 3 * modifier
+ movdqa xmm5, sixteens // 16s
+ psubusw xmm5, xmm4 // 16 - modifiers
+ movdqa xmm4, xmm5 // save the modifiers
+ pmullw xmm4, xmm6 // multiplier values
+ paddusw xmm1, xmm4 // accumulator
+ paddusw xmm2, xmm5 // count
+ add esi, 8 // next frame
+                dec         ecx         // count down the stored frames
+ jnz next_frame
+
+ movdqa counts, xmm2
+ psrlw xmm2, 1 // divide count by 2 for rounding
+ paddusw xmm1, xmm2 // rounding added in
+
+ mov frameptr, esi
+
+ movdqa sums, xmm1
+ }
+
+ for (i = 0; i < 8; i++)
+ {
+ int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
+ blurvalue >>= 16;
+ d[i] = blurvalue;
+ }
+
+ s += 8;
+ d += 8;
+ byte += 8;
+ }
+ while (byte < bytes);
+ }
+
+ ++ppi->frame;
+ __asm emms
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : temp_filter_mmx
+ *
+ * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ * unsigned char *s : Pointer to source frame.
+ * unsigned char *d : Pointer to destination frame.
+ * int bytes : Number of bytes to filter.
+ * int strength : Strength of filter to apply.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ *  FUNCTION    : Performs a closeness-adjusted temporal blur
+ *
+ * SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_mmx
+(
+ pre_proc_instance *ppi,
+ unsigned char *s,
+ unsigned char *d,
+ int bytes,
+ int strength
+)
+{
+ int byte = 0;
+ unsigned char *frameptr = ppi->frame_buffer;
+
+ __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3};
+ __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
+
+ if (ppi->frame == 0)
+ {
+ do
+ {
+ int i;
+ int frame = 0;
+
+ do
+ {
+ for (i = 0; i < 4; i++)
+ {
+ *frameptr = s[byte+i];
+ ++frameptr;
+ }
+
+ ++frame;
+ }
+ while (frame < FRAMECOUNT);
+
+ for (i = 0; i < 4; i++)
+ d[byte+i] = s[byte+i];
+
+ byte += 4;
+
+ }
+ while (byte < bytes);
+ }
+ else
+ {
+ int i;
+ int offset2 = (ppi->frame % FRAMECOUNT);
+
+ do
+ {
+ __declspec(align(16)) unsigned short counts[8];
+ __declspec(align(16)) unsigned short sums[8];
+ __asm
+ {
+
+ mov eax, offset2
+ mov edi, s // source pixels
+ pxor mm1, mm1 // accumulator
+ pxor mm7, mm7
+
+ mov esi, frameptr // accumulator
+ pxor mm2, mm2 // count
+
+ movd mm3, DWORD PTR [edi]
+ movd DWORD PTR [esi+4*eax], mm3
+
+ punpcklbw mm3, mm2 // mm3 source pixels
+ mov ecx, FRAMECOUNT
+
+ next_frame:
+ movd mm4, DWORD PTR [esi] // get frame buffer values
+ punpcklbw mm4, mm7 // mm4 frame buffer pixels
+ movq mm6, mm4 // save the pixel values
+ psubsw mm4, mm3 // subtracted pixel values
+ pmullw mm4, mm4 // square mm4
+ movd mm5, strength
+ psrlw mm4, mm5 // should be strength
+ pmullw mm4, threes // 3 * modifier
+ movq mm5, sixteens // 16s
+ psubusw mm5, mm4 // 16 - modifiers
+ movq mm4, mm5 // save the modifiers
+ pmullw mm4, mm6 // multiplier values
+ paddusw mm1, mm4 // accumulator
+ paddusw mm2, mm5 // count
+ add esi, 4 // next frame
+                dec         ecx         // count down the stored frames
+ jnz next_frame
+
+ movq counts, mm2
+ psrlw mm2, 1 // divide count by 2 for rounding
+ paddusw mm1, mm2 // rounding added in
+
+ mov frameptr, esi
+
+ movq sums, mm1
+
+ }
+
+ for (i = 0; i < 4; i++)
+ {
+ int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
+ blurvalue >>= 16;
+ d[i] = blurvalue;
+ }
+
+ s += 4;
+ d += 4;
+ byte += 4;
+ }
+ while (byte < bytes);
+ }
+
+ ++ppi->frame;
+ __asm emms
+}
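
A scalar sketch of the per-pixel computation both temporal filters above perform once the ring buffer is full: each stored frame's pixel is weighted by how close it is to the current source pixel, and the weighted sum is normalised with the fixed_divide reciprocal table. It assumes fixed_divide[c] is roughly (1 << 16) / c and that the current source pixel has already been written into its ring-buffer slot, so the weight total is never zero.

    static unsigned char temp_filter_pixel(const unsigned char history[7],
                                           unsigned char src, int strength,
                                           const unsigned int *fixed_divide)
    {
        unsigned int acc = 0, count = 0;
        int f;

        for (f = 0; f < 7; f++)                         /* FRAMECOUNT stored frames */
        {
            int diff     = history[f] - src;
            int modifier = (diff * diff) >> strength;   /* closeness measure */

            modifier *= 3;
            modifier  = 16 - modifier;                  /* near pixels weigh more */
            if (modifier < 0)
                modifier = 0;                           /* saturating subtract */

            acc   += (unsigned int)history[f] * modifier;
            count += modifier;
        }

        /* Weighted average: (acc + count/2) / count via the reciprocal table. */
        return (unsigned char)(((acc + count / 2) * fixed_divide[count]) >> 16);
    }

The reciprocal table turns the variable-count division into a multiply and shift, which is why the counts are kept alongside the sums.
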
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
new file mode 100644
index 000000000..847fc6e37
--- /dev/null
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -0,0 +1,438 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_mmx)
+sym(vp8_fast_quantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ movq mm0, [rsi]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm1, [rax]
+
+ movq mm3, mm0
+ psraw mm0, 15
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ; abs
+
+ movq mm2, mm3
+ pcmpgtw mm1, mm2
+
+ pandn mm1, mm2
+ movq mm3, mm1
+
+ mov rdx, arg(6) ;quant_ptr
+ movq mm1, [rdx]
+
+ mov rcx, arg(5) ;round_ptr
+ movq mm2, [rcx]
+
+ paddw mm3, mm2
+ pmulhuw mm3, mm1
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+ movq mm0, mm3
+
+ movq [rdi], mm3
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm2, [rax]
+
+ pmullw mm3, mm2
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax], mm3
+
+ ; next 8
+ movq mm4, [rsi+8]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+8]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+8]
+ movq mm6, [rcx+8]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+8], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+8]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+8], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+16]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+16]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+16]
+ movq mm6, [rcx+16]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+16], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+16]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+16], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+24]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+24]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+24]
+ movq mm6, [rcx+24]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+24], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+24]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+24], mm7
+
+
+
+ mov rdi, arg(4) ;scan_mask
+ mov rsi, arg(2) ;qcoeff_ptr
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rsi+8]
+
+ movq mm2, [rdi]
+ movq mm3, [rdi+8];
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ movq mm5, mm0
+
+ paddd mm5, mm1
+
+ movq mm0, [rsi+16]
+ movq mm1, [rsi+24]
+
+ movq mm2, [rdi+16]
+ movq mm3, [rdi+24];
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ paddd mm5, mm0
+
+ paddd mm5, mm1
+ movq mm0, mm5
+
+ psrlq mm5, 32
+ paddd mm0, mm5
+
+ ; eob adjustment begins here
+ movd rcx, mm0
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx ; rdx=-rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
+ ; Substitute the sse assembly for the old mmx mixed assembly/C. The
+ ; following is kept as reference
+ ; movd rcx, mm0
+ ; bsr rax, rcx
+ ;
+ ; mov eob, rax
+ ; mov eee, rcx
+ ;
+ ;if(eee==0)
+ ;{
+ ; eob=-1;
+ ;}
+ ;else if(eee<0)
+ ;{
+ ; eob=15;
+ ;}
+ ;d->eob = eob+1;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_sse)
+sym(vp8_fast_quantize_b_impl_sse):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ movdqa xmm0, [rsi]
+
+ mov rax, arg(1) ;zbin_ptr
+ movdqa xmm1, [rax]
+
+ movdqa xmm3, xmm0
+ psraw xmm0, 15
+
+ pxor xmm3, xmm0
+ psubw xmm3, xmm0 ; abs
+
+ movdqa xmm2, xmm3
+ pcmpgtw xmm1, xmm2
+
+ pandn xmm1, xmm2
+ movdqa xmm3, xmm1
+
+ mov rdx, arg(6) ; quant_ptr
+ movdqa xmm1, [rdx]
+
+ mov rcx, arg(5) ; round_ptr
+ movdqa xmm2, [rcx]
+
+ paddw xmm3, xmm2
+ pmulhuw xmm3, xmm1
+
+ pxor xmm3, xmm0
+        psubw           xmm3, xmm0              ; restore the sign
+
+ mov rdi, arg(2) ;qcoeff_ptr
+ movdqa xmm0, xmm3
+
+ movdqa [rdi], xmm3
+
+ mov rax, arg(3) ;dequant_ptr
+ movdqa xmm2, [rax]
+
+ pmullw xmm3, xmm2
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movdqa [rax], xmm3
+
+ ; next 8
+ movdqa xmm4, [rsi+16]
+
+ mov rax, arg(1) ;zbin_ptr
+ movdqa xmm5, [rax+16]
+
+ movdqa xmm7, xmm4
+ psraw xmm4, 15
+
+ pxor xmm7, xmm4
+ psubw xmm7, xmm4 ; abs
+
+ movdqa xmm6, xmm7
+ pcmpgtw xmm5, xmm6
+
+ pandn xmm5, xmm6
+ movdqa xmm7, xmm5
+
+ movdqa xmm5, [rdx+16]
+ movdqa xmm6, [rcx+16]
+
+
+ paddw xmm7, xmm6
+ pmulhuw xmm7, xmm5
+
+ pxor xmm7, xmm4
+        psubw           xmm7, xmm4              ; restore the sign
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movdqa xmm1, xmm7
+ movdqa [rdi+16], xmm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movdqa xmm6, [rax+16]
+
+ pmullw xmm7, xmm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movdqa [rax+16], xmm7
+ mov rdi, arg(4) ;scan_mask
+
+ pxor xmm7, xmm7
+ movdqa xmm2, [rdi]
+
+ movdqa xmm3, [rdi+16];
+ pcmpeqw xmm0, xmm7
+
+ pcmpeqw xmm1, xmm7
+ pcmpeqw xmm6, xmm6
+
+ pxor xmm0, xmm6
+ pxor xmm1, xmm6
+
+ psrlw xmm0, 15
+ psrlw xmm1, 15
+
+ pmaddwd xmm0, xmm2
+ pmaddwd xmm1, xmm3
+
+ movq xmm2, xmm0
+ movq xmm3, xmm1
+
+ psrldq xmm0, 8
+ psrldq xmm1, 8
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+
+ paddd xmm0, xmm2
+ movq xmm1, xmm0
+
+ psrldq xmm0, 4
+ paddd xmm1, xmm0
+
+ movd rcx, xmm1
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
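For orientation, here is a hedged C sketch of the per-coefficient arithmetic the two fast-quantize routines above implement. Argument names follow the prototype comment, the shift by 16 mirrors the unsigned high multiply (pmulhuw), and the eob step is the mask-and-bsr sequence sketched earlier; this is a model of the SIMD code, not the shipped C path.

    #include <stdlib.h>

    /* Dead-zone against zbin, round/scale by quant, then write the quantized
       and dequantized coefficients for one 4x4 block of 16 values. */
    static void fast_quantize_b_sketch(const short *coeff_ptr, const short *zbin_ptr,
                                       short *qcoeff_ptr, const short *dequant_ptr,
                                       const short *round_ptr, const short *quant_ptr,
                                       short *dqcoeff_ptr)
    {
        int i;

        for (i = 0; i < 16; i++) {
            int sign = coeff_ptr[i] < 0 ? -1 : 0;
            int x    = abs(coeff_ptr[i]);

            if (x < zbin_ptr[i])                       /* below the dead zone */
                x = 0;

            x = (unsigned short)(x + round_ptr[i]);    /* paddw, mod 2^16 */
            x = (int)(((unsigned)x * (unsigned short)quant_ptr[i]) >> 16); /* pmulhuw */
            x = (x ^ sign) - sign;                     /* restore the sign */

            qcoeff_ptr[i]  = (short)x;
            dqcoeff_ptr[i] = (short)(x * dequant_ptr[i]); /* pmullw keeps the low 16 bits */
        }
    }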
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
new file mode 100644
index 000000000..a825698e7
--- /dev/null
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -0,0 +1,428 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_sad16x16_mmx)
+global sym(vp8_sad8x16_mmx)
+global sym(vp8_sad8x8_mmx)
+global sym(vp8_sad4x4_mmx)
+global sym(vp8_sad16x8_mmx)
+
+%idefine QWORD
+
+;unsigned int vp8_sad16x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x16x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpcklbw mm2, mm6
+
+ punpckhbw mm1, mm6
+ punpckhbw mm3, mm6
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ paddw mm7, mm1
+
+ cmp rsi, rcx
+ jne x16x16sad_mmx_loop
+
+
+ movq mm0, mm7
+
+ punpcklwd mm0, mm6
+ punpckhwd mm7, mm6
+
+ paddw mm0, mm7
+ movq mm7, mm0
+
+
+ psrlq mm0, 32
+ paddw mm7, mm0
+
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
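The routine above, like the rest of this file, computes a plain sum of absolute differences; only the block size and the unrolling change between functions. A minimal C reference for the 16x16 case:

    #include <stdlib.h>

    /* C reference for a 16x16 SAD; the MMX code reaches the same total with
       psubusb/por (|a-b| on unsigned bytes) and word accumulation. */
    static unsigned int sad16x16_ref(const unsigned char *src, int src_stride,
                                     const unsigned char *ref, int ref_stride)
    {
        unsigned int sad = 0;
        int row, col;

        for (row = 0; row < 16; row++) {
            for (col = 0; col < 16; col++)
                sad += abs(src[col] - ref[col]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }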
+
+
+;unsigned int vp8_sad8x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad8x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x8x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ paddw mm7, mm2
+ cmp rsi, rcx
+
+ jne x8x16sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x8x8sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ cmp rsi, rcx
+
+ jne x8x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad4x4_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rdi]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ pxor mm3, mm3
+
+ punpcklbw mm0, mm3
+ punpckhbw mm2, mm3
+
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movd mm4, QWORD PTR [rsi]
+ movd mm5, QWORD PTR [rdi]
+
+ movd mm6, QWORD PTR [rsi+rax]
+ movd mm7, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm4, mm6
+ punpcklbw mm5, mm7
+
+ movq mm6, mm4
+ psubusb mm4, mm5
+
+ psubusb mm5, mm6
+ por mm4, mm5
+
+ movq mm5, mm4
+ punpcklbw mm4, mm3
+
+ punpckhbw mm5, mm3
+ paddw mm4, mm5
+
+ paddw mm0, mm4
+ movq mm1, mm0
+
+ punpcklwd mm0, mm3
+ punpckhwd mm1, mm3
+
+ paddw mm0, mm1
+ movq mm1, mm0
+
+ psrlq mm0, 32
+ paddw mm0, mm1
+
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad16x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x16x8sad_mmx_loop:
+
+ movq mm0, [rsi]
+ movq mm1, [rdi]
+
+ movq mm2, [rsi+8]
+ movq mm3, [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpckhbw mm1, mm6
+
+ punpcklbw mm2, mm6
+ punpckhbw mm3, mm6
+
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+ paddw mm0, mm1
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne x16x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
new file mode 100644
index 000000000..53240bbf1
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -0,0 +1,329 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+;unsigned int vp8_sad16x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad16x16_wmt)
+sym(vp8_sad16x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor xmm7, xmm7
+
+x16x16sad_wmt_loop:
+
+ movq xmm0, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rsi+8]
+
+ movq xmm1, QWORD PTR [rdi]
+ movq xmm3, QWORD PTR [rdi+8]
+
+ movq xmm4, QWORD PTR [rsi+rax]
+ movq xmm5, QWORD PTR [rdi+rdx]
+
+
+ punpcklbw xmm0, xmm2
+ punpcklbw xmm1, xmm3
+
+ psadbw xmm0, xmm1
+ movq xmm6, QWORD PTR [rsi+rax+8]
+
+ movq xmm3, QWORD PTR [rdi+rdx+8]
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ punpcklbw xmm4, xmm6
+
+ punpcklbw xmm5, xmm3
+ psadbw xmm4, xmm5
+
+ paddw xmm7, xmm0
+ paddw xmm7, xmm4
+
+ cmp rsi, rcx
+ jne x16x16sad_wmt_loop
+
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd rax, xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
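The _wmt variants lean on psadbw, which collapses sixteen byte differences into two partial sums in one instruction. A hedged intrinsics sketch of the same idea follows; it is an illustration only, since the shipped routine loads 8 bytes at a time and packs pairs of rows before the psadbw.

    #include <emmintrin.h>
    #include <stdint.h>

    /* SSE2 sketch: one psadbw per 16 source bytes replaces the
       subtract/abs/accumulate chain of the MMX version. */
    static unsigned int sad16x16_sse2_sketch(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride)
    {
        __m128i acc = _mm_setzero_si128();
        int row;

        for (row = 0; row < 16; row++) {
            __m128i s = _mm_loadu_si128((const __m128i *)src);
            __m128i r = _mm_loadu_si128((const __m128i *)ref);
            acc = _mm_add_epi64(acc, _mm_sad_epu8(s, r)); /* two 64-bit partial sums */
            src += src_stride;
            ref += ref_stride;
        }
        acc = _mm_add_epi64(acc, _mm_srli_si128(acc, 8)); /* fold high half into low */
        return (unsigned int)_mm_cvtsi128_si32(acc);
    }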
+;unsigned int vp8_sad8x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_err)
+global sym(vp8_sad8x16_wmt)
+sym(vp8_sad8x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+
+ lea rcx, [rcx+rbx*8]
+ pxor mm7, mm7
+
+x8x16sad_wmt_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg x8x16sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, QWORD PTR [rsi+rbx]
+ movq mm3, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm7, mm0
+ paddw mm7, mm2
+
+ cmp rsi, rcx
+ jne x8x16sad_wmt_loop
+
+ movd rax, mm7
+
+x8x16sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
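vp8_sad8x16_wmt and the two routines after it take an extra max_err argument: the running SAD is checked once per row pair and the function bails out as soon as the candidate is already worse than the current best. In C the pattern looks roughly like this (a sketch under the same argument names):

    /* Early-exit SAD sketch: abandon the candidate once it cannot win. */
    static unsigned int sad8x16_early_exit(const unsigned char *src, int src_stride,
                                           const unsigned char *ref, int ref_stride,
                                           int max_err)
    {
        unsigned int sad = 0;
        int row, col;

        for (row = 0; row < 16; row += 2) {
            if ((int)sad > max_err)
                return sad;                 /* already worse than the best so far */
            for (col = 0; col < 8; col++) {
                int d0 = src[col]              - ref[col];
                int d1 = src[src_stride + col] - ref[ref_stride + col];
                sad += (d0 < 0 ? -d0 : d0) + (d1 < 0 ? -d1 : d1);
            }
            src += 2 * src_stride;
            ref += 2 * ref_stride;
        }
        return sad;
    }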
+
+;unsigned int vp8_sad8x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp8_sad8x8_wmt)
+sym(vp8_sad8x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+x8x8sad_wmt_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg x8x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rbx]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne x8x8sad_wmt_loop
+
+ movd rax, mm7
+x8x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad4x4_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad4x4_wmt)
+sym(vp8_sad4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rdi]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ movd mm4, QWORD PTR [rsi]
+
+ movd mm5, QWORD PTR [rdi]
+ movd mm6, QWORD PTR [rsi+rax]
+
+ movd mm7, QWORD PTR [rdi+rdx]
+ punpcklbw mm4, mm6
+
+ punpcklbw mm5, mm7
+ psadbw mm4, mm5
+
+ paddw mm0, mm4
+ movd rax, mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp8_sad16x8_wmt)
+sym(vp8_sad16x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+x16x8sad_wmt_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg x16x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, QWORD PTR [rsi+rbx]
+ movq mm5, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ movq mm1, QWORD PTR [rsi+rbx+8]
+ movq mm3, QWORD PTR [rdi+rdx+8]
+
+ psadbw mm4, mm5
+ psadbw mm1, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm0, mm2
+ paddw mm4, mm1
+
+ paddw mm7, mm0
+ paddw mm7, mm4
+
+ cmp rsi, rcx
+ jne x16x8sad_wmt_loop
+
+ movd rax, mm7
+
+x16x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
new file mode 100644
index 000000000..38cc02957
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -0,0 +1,939 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+%macro PROCESS_16X2X3 1
+%if %1
+ movdqa xmm0, [rsi]
+ lddqu xmm5, [rdi]
+ lddqu xmm6, [rdi+1]
+ lddqu xmm7, [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ lddqu xmm1, [rdi]
+ lddqu xmm2, [rdi+1]
+ lddqu xmm3, [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ lddqu xmm1, QWORD PTR [rdi+rdx]
+ lddqu xmm2, QWORD PTR [rdi+rdx+1]
+ lddqu xmm3, QWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 1
+%if %1
+ movq mm0, [rsi]
+ movq mm5, [rdi]
+ movq mm6, [rdi+1]
+ movq mm7, [rdi+2]
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, [rsi]
+ movq mm1, [rdi]
+ movq mm2, [rdi+1]
+ movq mm3, [rdi+2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endif
+ movq mm0, QWORD PTR [rsi+rax]
+ movq mm1, QWORD PTR [rdi+rdx]
+ movq mm2, QWORD PTR [rdi+rdx+1]
+ movq mm3, QWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endmacro
+
+%macro LOAD_X4_ADDRESSES 5
+ mov %2, [%1+REG_SZ_BYTES*0]
+ mov %3, [%1+REG_SZ_BYTES*1]
+
+ mov %4, [%1+REG_SZ_BYTES*2]
+ mov %5, [%1+REG_SZ_BYTES*3]
+%endmacro
+
+%macro PROCESS_16X2X4 1
+%if %1
+ movdqa xmm0, [rsi]
+ lddqu xmm4, [rcx]
+ lddqu xmm5, [rdx]
+ lddqu xmm6, [rbx]
+ lddqu xmm7, [rdi]
+
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ lddqu xmm1, [rcx]
+ lddqu xmm2, [rdx]
+ lddqu xmm3, [rbx]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, [rdi]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ lddqu xmm1, QWORD PTR [rcx+rbp]
+ lddqu xmm2, QWORD PTR [rdx+rbp]
+ lddqu xmm3, QWORD PTR [rbx+rbp]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, QWORD PTR [rdi+rbp]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rcx, [rcx+rbp*2]
+
+ lea rdx, [rdx+rbp*2]
+ lea rbx, [rbx+rbp*2]
+
+ lea rdi, [rdi+rbp*2]
+
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+
+%endmacro
+
+%macro PROCESS_8X2X4 1
+%if %1
+ movq mm0, [rsi]
+ movq mm4, [rcx]
+ movq mm5, [rdx]
+ movq mm6, [rbx]
+ movq mm7, [rdi]
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, [rsi]
+ movq mm1, [rcx]
+ movq mm2, [rdx]
+ movq mm3, [rbx]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, [rdi]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+ psadbw mm1, mm0
+ paddw mm7, mm1
+%endif
+ movq mm0, QWORD PTR [rsi+rax]
+ movq mm1, QWORD PTR [rcx+rbp]
+ movq mm2, QWORD PTR [rdx+rbp]
+ movq mm3, QWORD PTR [rbx+rbp]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [rdi+rbp]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+ lea rsi, [rsi+rax*2]
+ lea rcx, [rcx+rbp*2]
+
+ lea rdx, [rdx+rbp*2]
+ lea rbx, [rbx+rbp*2]
+
+ lea rdi, [rdi+rbp*2]
+
+ psadbw mm1, mm0
+ paddw mm7, mm1
+
+%endmacro
+
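The PROCESS_* macros above evaluate a source block against three horizontally adjacent reference positions at once (the lddqu loads at rdi, rdi+1 and rdi+2), or against four independent candidates in the X4 forms. A hedged C reference for the three-candidate case, which the sse3 functions below unroll:

    /* C reference for the x3 helpers: one SAD per reference offset 0, 1, 2. */
    static void sad16x16x3_ref(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               int results[3])
    {
        int i, row, col;

        for (i = 0; i < 3; i++) {
            const unsigned char *s = src;
            const unsigned char *r = ref + i;
            int sad = 0;

            for (row = 0; row < 16; row++) {
                for (col = 0; col < 16; col++) {
                    int d = s[col] - r[col];
                    sad += d < 0 ? -d : d;
                }
                s += src_stride;
                r += ref_stride;
            }
            results[i] = sad;
        }
    }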
+;void vp8_sad16x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_sse3)
+sym(vp8_sad16x16x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_sse3)
+sym(vp8_sad16x8x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x16x3_sse3)
+sym(vp8_sad8x16x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X3 1
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm5
+ movd [rdi+4], mm6
+ movd [rdi+8], mm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x8x3_sse3)
+sym(vp8_sad8x8x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X3 1
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm5
+ movd [rdi+4], mm6
+ movd [rdi+8], mm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad4x4x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad4x4x3_sse3)
+sym(vp8_sad4x4x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rdi]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, QWORD PTR [rdi+1]
+ movd mm5, QWORD PTR [rdi+2]
+
+ movd mm2, QWORD PTR [rdi+rdx+1]
+ movd mm3, QWORD PTR [rdi+rdx+2]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+
+
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm2, QWORD PTR [rdi]
+
+ movd mm3, QWORD PTR [rsi+rax]
+ movd mm6, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm6
+
+ movd mm3, QWORD PTR [rdi+1]
+ movd mm7, QWORD PTR [rdi+2]
+
+ psadbw mm2, mm0
+
+ paddw mm1, mm2
+
+ movd mm2, QWORD PTR [rdi+rdx+1]
+ movd mm6, QWORD PTR [rdi+rdx+2]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm6
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ mov rdi, arg(4) ;Results
+ movd [rdi], mm1
+
+ movd [rdi+4], mm3
+ movd [rdi+8], mm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad16x16_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_err)
+;%define lddqu movdqu
+global sym(vp8_sad16x16_sse3)
+sym(vp8_sad16x16_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+
+ lea rcx, [rcx+rbx*8]
+ pxor mm7, mm7
+
+vp8_sad16x16_sse3_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg vp8_sad16x16_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, QWORD PTR [rsi+rbx]
+ movq mm5, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ movq mm1, QWORD PTR [rsi+rbx+8]
+ movq mm3, QWORD PTR [rdi+rdx+8]
+
+ psadbw mm4, mm5
+ psadbw mm1, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm0, mm2
+ paddw mm4, mm1
+
+ paddw mm7, mm0
+ paddw mm7, mm4
+
+ cmp rsi, rcx
+ jne vp8_sad16x16_sse3_loop
+
+ movd rax, mm7
+
+vp8_sad16x16_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x4d_sse3)
+sym(vp8_sad16x16x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_16X2X4 1
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+12], xmm0
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x4d_sse3)
+sym(vp8_sad16x8x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_16X2X4 1
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+12], xmm0
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x16x4d_sse3)
+sym(vp8_sad8x16x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_8X2X4 1
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm4
+ movd [rdi+4], mm5
+ movd [rdi+8], mm6
+ movd [rdi+12], mm7
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x8x4d_sse3)
+sym(vp8_sad8x8x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_8X2X4 1
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm4
+ movd [rdi+4], mm5
+ movd [rdi+8], mm6
+ movd [rdi+12], mm7
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad4x4x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad4x4x4d_sse3)
+sym(vp8_sad4x4x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rcx]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rcx+rbp]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, QWORD PTR [rdx]
+ movd mm5, QWORD PTR [rbx]
+
+ movd mm6, QWORD PTR [rdi]
+ movd mm2, QWORD PTR [rdx+rbp]
+
+ movd mm3, QWORD PTR [rbx+rbp]
+ movd mm7, QWORD PTR [rdi+rbp]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ punpcklbw mm6, mm7
+ psadbw mm4, mm0
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+
+
+
+ lea rsi, [rsi+rax*2]
+ lea rcx, [rcx+rbp*2]
+
+ lea rdx, [rdx+rbp*2]
+ lea rbx, [rbx+rbp*2]
+
+ lea rdi, [rdi+rbp*2]
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm2, QWORD PTR [rcx]
+
+ movd mm3, QWORD PTR [rsi+rax]
+ movd mm7, QWORD PTR [rcx+rbp]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm7
+
+ movd mm3, QWORD PTR [rdx]
+ movd mm7, QWORD PTR [rbx]
+
+ psadbw mm2, mm0
+ mov rax, rbp
+
+ pop rbp
+ mov rsi, arg(4) ;Results
+
+ paddw mm1, mm2
+ movd [rsi], mm1
+
+ movd mm2, QWORD PTR [rdx+rax]
+ movd mm1, QWORD PTR [rbx+rax]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm1
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ movd mm2, QWORD PTR [rdi]
+ movd mm1, QWORD PTR [rdi+rax]
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ movd [rsi+4], mm3
+ punpcklbw mm2, mm1
+
+ movd [rsi+8], mm7
+ psadbw mm2, mm0
+
+ paddw mm2, mm6
+ movd [rsi+12], mm2
+
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
new file mode 100644
index 000000000..1bb956121
--- /dev/null
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -0,0 +1,367 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+%macro PROCESS_16X2X3 1
+%if %1
+ movdqa xmm0, [rsi]
+ lddqu xmm5, [rdi]
+ lddqu xmm6, [rdi+1]
+ lddqu xmm7, [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ lddqu xmm1, [rdi]
+ lddqu xmm2, [rdi+1]
+ lddqu xmm3, [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ lddqu xmm1, QWORD PTR [rdi+rdx]
+ lddqu xmm2, QWORD PTR [rdi+rdx+1]
+ lddqu xmm3, QWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+ movdqa xmm0, [rsi]
+ movdqa xmm4, [rdi]
+ movdqa xmm7, [rdi+16]
+
+ movdqa xmm5, xmm7
+ palignr xmm5, xmm4, %2
+
+ movdqa xmm6, xmm7
+ palignr xmm6, xmm4, (%2+1)
+
+ palignr xmm7, xmm4, (%2+2)
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ movdqa xmm4, [rdi]
+ movdqa xmm3, [rdi+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ movdqa xmm4, QWORD PTR [rdi+rdx]
+ movdqa xmm3, QWORD PTR [rdi+rdx+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+;void vp8_sad16x16x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_ssse3)
+sym(vp8_sad16x16x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp vp8_sad16x16x3_ssse3_skiptable
+vp8_sad16x16x3_ssse3_jumptable:
+ dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
+vp8_sad16x16x3_ssse3_skiptable:
+
+ call vp8_sad16x16x3_ssse3_do_jump
+vp8_sad16x16x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
+
+vp8_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+vp8_sad16x16x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
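The ssse3 version avoids unaligned lddqu loads: it computes ref_ptr & 15, dispatches through a position-independent jump table (the call/pop pair recovers the current address), and runs one of sixteen specialised loops in which palignr stitches the unaligned reference line out of two aligned loads. palignr only accepts an immediate byte count, which is the reason for the sixteen copies. A hedged intrinsics illustration with the offset fixed at 5:

    #include <tmmintrin.h>
    #include <stdint.h>

    /* line_start must be 16-byte aligned; the result equals an unaligned
       16-byte load at line_start + 5. One such helper per alignment value
       is what the jump table above selects between. */
    static __m128i load_ref_offset_5(const uint8_t *line_start)
    {
        __m128i lo = _mm_load_si128((const __m128i *)line_start);        /* bytes 0..15  */
        __m128i hi = _mm_load_si128((const __m128i *)(line_start + 16)); /* bytes 16..31 */
        return _mm_alignr_epi8(hi, lo, 5);
    }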
+;void vp8_sad16x8x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_ssse3)
+sym(vp8_sad16x8x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp vp8_sad16x8x3_ssse3_skiptable
+vp8_sad16x8x3_ssse3_jumptable:
+ dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
+vp8_sad16x8x3_ssse3_skiptable:
+
+ call vp8_sad16x8x3_ssse3_do_jump
+vp8_sad16x8x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
+
+vp8_sad16x8x3_ssse3_aligned_by_15:
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+vp8_sad16x8x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
new file mode 100644
index 000000000..ce3e61066
--- /dev/null
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -0,0 +1,431 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+; unsigned short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp8_subtract_b_mmx_impl)
+sym(vp8_subtract_b_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+        movsxd          rdx, dword ptr arg(1)   ;src_stride
+        movsxd          rcx, dword ptr arg(4)   ;pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi], mm0
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2],mm0
+
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp8_subtract_mby_mmx)
+sym(vp8_subtract_mby_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(1) ;src
+ mov rdi, arg(0) ;diff
+
+ mov rax, arg(2) ;pred
+ movsxd rdx, dword ptr arg(3) ;stride
+
+ mov rcx, 16
+ pxor mm0, mm0
+
+submby_loop:
+
+ movq mm1, [rsi]
+ movq mm3, [rax]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi], mm1
+ movq [rdi+8], mm2
+
+
+ movq mm1, [rsi+8]
+ movq mm3, [rax+8]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi+16], mm1
+ movq [rdi+24], mm2
+
+
+ add rdi, 32
+ add rax, 16
+
+ lea rsi, [rsi+rdx]
+
+ sub rcx, 1
+ jnz submby_loop
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
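A minimal C reference for the luma subtract above: diff receives the 16x16 residual (source minus prediction). The predictor is assumed to be packed 16 bytes per row, which matches the add rax, 16 in the loop.

    static void subtract_mby_ref(short *diff, const unsigned char *src,
                                 const unsigned char *pred, int stride)
    {
        int row, col;

        for (row = 0; row < 16; row++) {
            for (col = 0; col < 16; col++)
                diff[col] = (short)(src[col] - pred[col]);
            diff += 16;       /* 16 shorts = 32 bytes, the rdi += 32 above */
            pred += 16;
            src  += stride;
        }
    }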
+
+;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp8_subtract_mbuv_mmx)
+sym(vp8_subtract_mbuv_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;short *udiff = diff + 256;
+ ;short *vdiff = diff + 320;
+ ;unsigned char *upred = pred + 256;
+ ;unsigned char *vpred = pred + 320;
+
+ ;unsigned char *z = usrc;
+ ;unsigned short *diff = udiff;
+ ;unsigned char *Predictor= upred;
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+ mov rsi, arg(1) ;z = usrc
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ add rax, 256 ;Predictor = pred + 256
+        movsxd          rdx, dword ptr arg(4)   ;stride
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+ ;unsigned char *z = vsrc;
+ ;unsigned short *diff = vdiff;
+ ;unsigned char *Predictor= vpred;
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+        mov             rsi, arg(2)             ;z = vsrc
+        add             rdi, 320*2              ;diff = diff + 320 (shorts)
+        add             rax, 320                ;Predictor = pred + 320
+        movsxd          rdx, dword ptr arg(4)   ;stride
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
new file mode 100644
index 000000000..d0da82ad4
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -0,0 +1,980 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
+global sym(vp8_get_mb_ss_mmx)
+sym(vp8_get_mb_ss_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 8
+ ; end prolog
+
+ mov rax, arg(0) ;src_ptr
+ mov rcx, 16
+ pxor mm4, mm4
+
+NEXTROW:
+ movq mm0, [rax]
+ movq mm1, [rax+8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+
+ paddd mm4, mm0
+ paddd mm4, mm1
+ paddd mm4, mm2
+ paddd mm4, mm3
+
+ add rax, 32
+ dec rcx
+ ja NEXTROW
+ movq QWORD PTR [rsp], mm4
+
+ ;return sum[0]+sum[1];
+ movsxd rax, dword ptr [rsp]
+ movsxd rcx, dword ptr [rsp+4]
+ add rax, rcx
+
+
+ ; begin epilog
+ add rsp, 8
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
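The helper above sums the squares of 256 short coefficients (16 iterations over 32 bytes each). A C reference:

    static unsigned int get_mb_ss_ref(const short *src)
    {
        unsigned int ss = 0;
        int i;

        for (i = 0; i < 256; i++)
            ss += (unsigned int)(src[i] * src[i]);
        return ss;
    }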
+
+;unsigned int vp8_get8x8var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp8_get8x8var_mmx)
+sym(vp8_get8x8var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor            mm5, mm5                ; Blank mm5 (difference accumulator)
+        pxor            mm6, mm6                ; Blank mm6 (zero for unpacking)
+        pxor            mm7, mm7                ; Blank mm7 (squared-difference accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 5
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ ; movq mm4, [rbx + rdx]
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 6
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 7
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 8
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
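vp8_get8x8var_mmx accumulates two quantities in parallel: the signed sum of differences (in mm5) and the sum of squared differences (in mm7), which it writes through the Sum and SSE pointers before returning 0. A hedged C reference:

    static unsigned int get8x8var_ref(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse, int *sum)
    {
        unsigned int xx = 0;
        int x = 0;
        int row, col;

        for (row = 0; row < 8; row++) {
            for (col = 0; col < 8; col++) {
                int d = src[col] - ref[col];
                x  += d;
                xx += (unsigned int)(d * d);
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = xx;
        *sum = x;
        return 0;
    }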
+
+
+;unsigned int
+;vp8_get4x4var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp8_get4x4var_mmx)
+sym(vp8_get4x4var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor            mm5, mm5                ; Blank mm5 (difference accumulator)
+        pxor            mm6, mm6                ; Blank mm6 (zero for unpacking)
+        pxor            mm7, mm7                ; Blank mm7 (squared-difference accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp8_get4x4sse_cs_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride
+;)
+global sym(vp8_get4x4sse_cs_mmx)
+sym(vp8_get4x4sse_cs_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+
+        pxor            mm6, mm6                    ; Blank mm6 (zero register for unpacking)
+        pxor            mm7, mm7                    ; Blank mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+ ; Row 1
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 2
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm1, mm6
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movd rax, mm0
+
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%define mmx_filter_shift 7
+
+;void vp8_filter_block2d_bil4x4_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil4x4_var_mmx)
+sym(vp8_filter_block2d_bil4x4_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+
+ mov rax, arg(4) ;HFilter ;
+ mov rdx, arg(5) ;VFilter ;
+
+ mov rsi, arg(0) ;ref_ptr ;
+ mov rdi, arg(2) ;src_ptr ;
+
+ mov rcx, 4 ;
+ pxor mm0, mm0 ;
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm5, mm1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+
+filter_block2d_bil4x4_var_mmx_loop:
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm3, mm5 ;
+
+ movq mm5, mm1 ;
+ pmullw mm3, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ paddw mm1, mm3 ;
+
+
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+ psraw mm1, mmx_filter_shift ;
+
+ movd mm3, [rdi] ;
+ punpcklbw mm3, mm0 ;
+
+ psubw mm1, mm3 ;
+ paddw mm6, mm1 ;
+
+ pmaddwd mm1, mm1 ;
+ paddd mm7, mm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz filter_block2d_bil4x4_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(6) ;sum
+ mov rsi, arg(7) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
+;void vp8_filter_block2d_bil_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_mmx)
+sym(vp8_filter_block2d_bil_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor mm0, mm0 ;
+ movq mm1, [rsi] ;
+
+ movq mm3, [rsi+1] ;
+ movq mm2, mm1 ;
+
+ movq mm4, mm3 ;
+ punpcklbw mm1, mm0 ;
+
+ punpckhbw mm2, mm0 ;
+ pmullw mm1, [rax] ;
+
+ pmullw mm2, [rax] ;
+ punpcklbw mm3, mm0 ;
+
+ punpckhbw mm4, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ pmullw mm4, [rax+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm2, mm4 ;
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ paddw mm2, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm2, mmx_filter_shift ;
+ movq mm5, mm1
+
+ packuswb mm5, mm2 ;
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ add rsi, r8
+%endif
+
+filter_block2d_bil_var_mmx_loop:
+
+ movq mm1, [rsi] ;
+ movq mm3, [rsi+1] ;
+
+ movq mm2, mm1 ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm1, mm0 ;
+ punpckhbw mm2, mm0 ;
+
+ pmullw mm1, [rax] ;
+ pmullw mm2, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, [rax+8] ;
+ pmullw mm4, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+ psraw mm1, mmx_filter_shift ;
+
+ paddw mm2, [mmx_bi_rd GLOBAL] ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, mm5 ;
+ movq mm4, mm5 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ movq mm5, mm1 ;
+ packuswb mm5, mm2 ;
+
+ pmullw mm3, [rdx] ;
+ pmullw mm4, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ pmullw mm2, [rdx+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm2, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, [rdi] ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ psubw mm1, mm3 ;
+ psubw mm2, mm4 ;
+
+ paddw mm6, mm1 ;
+ pmaddwd mm1, mm1 ;
+
+ paddw mm6, mm2 ;
+ pmaddwd mm2, mm2 ;
+
+ paddd mm7, mm1 ;
+ paddd mm7, mm2 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(7) ;sum
+ mov rsi, arg(8) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_get16x16pred_error_mmx
+;(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride
+;)
+global sym(vp8_get16x16pred_error_mmx)
+sym(vp8_get16x16pred_error_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;DWORD PTR [src_ptr]
+ mov rdi, arg(2) ;DWORD PTR [ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+
+        pxor        mm0, mm0                        ; clear mm0 for unpack
+        pxor        mm7, mm7                        ; clear mm7 for accumulating diffs
+
+        pxor        mm6, mm6                        ; clear mm6 for accumulating sse
+ mov rcx, 16
+
+var16loop:
+
+ movq mm1, [rsi]
+ movq mm2, [rdi]
+
+ movq mm3, mm1
+ movq mm4, mm2
+
+ punpcklbw mm1, mm0
+ punpckhbw mm3, mm0
+
+ punpcklbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm2
+ psubw mm3, mm4
+
+ paddw mm7, mm1
+ pmaddwd mm1, mm1
+
+ paddw mm7, mm3
+ pmaddwd mm3, mm3
+
+ paddd mm6, mm1
+ paddd mm6, mm3
+
+
+ movq mm1, [rsi+8]
+ movq mm2, [rdi+8]
+
+ movq mm3, mm1
+ movq mm4, mm2
+
+ punpcklbw mm1, mm0
+ punpckhbw mm3, mm0
+
+ punpcklbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm2
+ psubw mm3, mm4
+
+ paddw mm7, mm1
+ pmaddwd mm1, mm1
+
+ paddw mm7, mm3
+ pmaddwd mm3, mm3
+
+ paddd mm6, mm1
+ paddd mm6, mm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz var16loop
+
+
+ movq mm1, mm6
+ pxor mm6, mm6
+
+ pxor mm5, mm5
+ punpcklwd mm6, mm7
+
+ punpckhwd mm5, mm7
+ psrad mm5, 16
+
+ psrad mm6, 16
+ paddd mm6, mm5
+
+ movq mm2, mm1
+ psrlq mm1, 32
+
+ paddd mm2, mm1
+ movq mm7, mm6
+
+ psrlq mm6, 32
+ paddd mm6, mm7
+
+ movd DWORD PTR [rsp], mm6 ;Sum
+ movd DWORD PTR [rsp+4], mm2 ;SSE
+
+ ; return (SSE-((Sum*Sum)>>8));
+ movsxd rdx, dword ptr [rsp]
+ imul rdx, rdx
+ sar rdx, 8
+ movsxd rax, dword ptr [rsp + 4]
+ sub rax, rdx
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
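+; rounding constant: 64 = 1 << (mmx_filter_shift - 1), added before the >> 7 in the bilinear filter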
+align 16
+mmx_bi_rd:
+ times 4 dw 64
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
new file mode 100644
index 000000000..7e5ee284b
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -0,0 +1,975 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+;unsigned int vp8_get_mb_ss_sse2
+;(
+; short *src_ptr
+;)
+global sym(vp8_get_mb_ss_sse2)
+sym(vp8_get_mb_ss_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 1
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ mov rax, arg(0) ;[src_ptr]
+ mov rcx, 8
+ pxor xmm4, xmm4
+
+NEXTROW:
+ movdqa xmm0, [rax]
+ movdqa xmm1, [rax+16]
+ movdqa xmm2, [rax+32]
+ movdqa xmm3, [rax+48]
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+ paddd xmm4, xmm0
+ paddd xmm4, xmm2
+
+ add rax, 0x40
+ dec rcx
+ ja NEXTROW
+
+ movdqa xmm3,xmm4
+ psrldq xmm4,8
+ paddd xmm4,xmm3
+ movdqa xmm3,xmm4
+ psrldq xmm4,4
+ paddd xmm4,xmm3
+ movd rax,xmm4
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp8_get16x16var_sse2)
+sym(vp8_get16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm4, xmm0
+
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm4
+
+ paddw xmm7, xmm1
+ pmaddwd xmm1, xmm1
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz var16loop
+
+
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movd DWORD PTR [rax], xmm7
+ movd DWORD PTR [rdi], xmm1
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get16x16pred_error_sse2
+;(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride
+;)
+global sym(vp8_get16x16pred_error_sse2)
+sym(vp8_get16x16pred_error_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+var16peloop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm4, xmm0
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm4
+
+ paddw xmm7, xmm1
+ pmaddwd xmm1, xmm1
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz var16peloop
+
+
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6
+ paddd xmm1, xmm2
+
+ movd DWORD PTR [rsp], xmm7 ;Sum
+ movd DWORD PTR [rsp+4], xmm1 ;SSE
+
+ ; return (SSE-((Sum*Sum)>>8));
+ movsxd rdx, dword ptr [rsp]
+ imul rdx, rdx
+ sar rdx, 8
+ movsxd rax, dword ptr [rsp + 4]
+ sub rax, rdx
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int vp8_get8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp8_get8x8var_sse2)
+sym(vp8_get8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ movq xmm1, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rdi]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ psubsw xmm1, xmm2
+ paddw xmm7, xmm1
+
+ pmaddwd xmm1, xmm1
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movq xmm2, QWORD PTR[rsi + rax * 2]
+ movq xmm3, QWORD PTR[rdi + rdx * 2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0
+
+ punpckhwd xmm7, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm6, xmm7
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddw xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddw xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movd rdx, xmm7
+ movsx rcx, dx
+
+ mov dword ptr [rax], ecx
+ movd DWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block2d_bil_var_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_sse2)
+sym(vp8_filter_block2d_bil_var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor xmm6, xmm6 ;
+ pxor xmm7, xmm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ;
+
+ movq xmm3, QWORD PTR [rsi+1] ;
+ punpcklbw xmm1, xmm0 ;
+
+ pmullw xmm1, [rax] ;
+    punpcklbw xmm3, xmm0                        ;
+ pmullw xmm3, [rax+16] ;
+ paddw xmm1, xmm3 ;
+
+ paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movdqa xmm5, xmm1
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+filter_block2d_bil_var_sse2_loop:
+
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, [xmm_bi_rd GLOBAL] ;
+
+ psraw xmm1, xmm_filter_shift ;
+ movdqa xmm3, xmm5 ;
+
+ movdqa xmm5, xmm1 ;
+ pmullw xmm3, [rdx] ;
+
+ pmullw xmm1, [rdx+16] ;
+ paddw xmm1, xmm3 ;
+
+ paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_sse2_loop ;
+
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(7) ; sum
+ mov rdi, arg(8) ; sumsquared
+
+ movd [rsi], mm2 ; xsum
+ movd [rdi], mm4 ; xxsum
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+        movq            xmm5, QWORD PTR [rsi]       ; xmm5 = s0,s1,s2..s7
+        movq            xmm3, QWORD PTR [rsi+1]     ; xmm3 = s1,s2,s3..s8
+        pavgb           xmm5, xmm3                  ; xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+%else
+ add rsi, r8
+%endif
+
+vp8_half_horiz_vert_variance16x_h_1:
+
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm2, QWORD PTR [rsi+1] ;
+        pavgb           xmm1, xmm2                  ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+        pavgb           xmm5, xmm1                  ; xmm5 = vertical average of the above
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+        movq            xmm3, QWORD PTR [rdi]       ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance16x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance16x_h_sse2)
+sym(vp8_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+vp8_half_vert_variance16x_h_1:
+        movq            xmm5, QWORD PTR [rsi]       ; xmm5 = s0,s1,s2..s7 (current row)
+        movq            xmm3, QWORD PTR [rsi+rax]   ; xmm3 = s0,s1,s2..s7 (next row)
+
+        pavgb           xmm5, xmm3                  ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+        movq            xmm3, QWORD PTR [rdi]       ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_vert_variance16x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2)
+sym(vp8_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor xmm0, xmm0 ;
+vp8_half_horiz_variance16x16_1:
+        movq            xmm5, QWORD PTR [rsi]       ; xmm5 = s0,s1,s2..s7
+        movq            xmm3, QWORD PTR [rsi+1]     ; xmm3 = s1,s2,s3..s8
+
+        pavgb           xmm5, xmm3                  ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+        movq            xmm3, QWORD PTR [rdi]       ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance16x16_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
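+; rounding constant: 64 = 1 << (xmm_filter_shift - 1), added before the >> 7 in the bilinear filter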
+align 16
+xmm_bi_rd:
+ times 8 dw 64
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
new file mode 100644
index 000000000..4a5b25b0d
--- /dev/null
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *vp7_filter
+);
+extern void filter_block1d_v6_mmx
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *vp7_filter
+);
+
+extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
+extern unsigned int vp8_get8x8var_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern unsigned int vp8_get4x4var_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern unsigned int vp8_get4x4sse_cs_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride
+);
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_mmx
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern unsigned int vp8_get16x16pred_error_mmx
+(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride
+);
+
+
+void vp8_test_get_mb_ss(void)
+{
+ short zz[] =
+ {
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ };
+ int s = 0, x = vp8_get_mb_ss_mmx(zz);
+ {
+ int y;
+
+ for (y = 0; y < 256; y++)
+ s += (zz[y] * zz[y]);
+ }
+
+ x += 0;
+}
+
+
+unsigned int vp8_get16x16var_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned *SSE,
+ unsigned *SUM
+)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+
+ *SSE = var;
+ *SUM = avg;
+ return (var - ((avg * avg) >> 8));
+
+}
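+/*
+ * The return expressions in this file use the identity
+ *     sum((d - mean)^2) = sum(d^2) - (sum(d))^2 / N
+ * over the N = w*h pixel differences d, i.e. variance = SSE - Sum*Sum/N, with the
+ * division implemented as a shift (>> 4 for 4x4, >> 6 for 8x8, >> 7 for 16x8 and
+ * 8x16, >> 8 for 16x16).  A plain-C reference of the same computation, for
+ * illustration only (not part of the build; the name is hypothetical):
+ *
+ *     static unsigned int variance_ref_c(const unsigned char *src, int src_stride,
+ *                                        const unsigned char *ref, int ref_stride,
+ *                                        int w, int h, unsigned int *sse)
+ *     {
+ *         int sum = 0, r, c;
+ *         unsigned int sse_acc = 0;
+ *
+ *         for (r = 0; r < h; r++)
+ *             for (c = 0; c < w; c++)
+ *             {
+ *                 int diff = src[r * src_stride + c] - ref[r * ref_stride + c];
+ *                 sum += diff;             // running sum of differences
+ *                 sse_acc += diff * diff;  // running sum of squared differences
+ *             }
+ *
+ *         *sse = sse_acc;
+ *         return sse_acc - (unsigned int)((sum * sum) / (w * h));
+ *     }
+ */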
+
+
+
+
+
+unsigned int vp8_variance4x4_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((avg * avg) >> 4));
+
+}
+
+unsigned int vp8_variance8x8_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+
+ return (var - ((avg * avg) >> 6));
+
+}
+
+unsigned int vp8_mse16x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ *sse = var;
+ return var;
+}
+
+
+unsigned int vp8_variance16x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp8_variance16x8_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_variance8x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////////
+// the mmx functions that do the bilinear filtering and var calculation  //
+// in one pass                                                           //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
+{
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 }
+};
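+// Each row of this table holds the two taps of the bilinear filter for the eight
+// sub-pixel offsets, each tap duplicated four times so it fills an MMX register.
+// The taps of a row always sum to 128 (1 << 7, the shift used by the assembly), so
+// a single filtered pixel is, in plain C (illustrative only):
+//
+//     out = (filter[0] * p[0] + filter[4] * p[1] + 64) >> 7;
+//
+// applied first across columns and then down the rows of the first-pass output.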
+
+unsigned int vp8_sub_pixel_variance4x4_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+
+
+}
+
+unsigned int vp8_sub_pixel_mse16x16_mmx(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
+
+unsigned int vp8_sub_pixel_variance16x8_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 7));
+}
+
+unsigned int vp8_i_variance16x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+
+}
+
+unsigned int vp8_i_variance8x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp8_i_sub_pixel_variance16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+ int f2soffset = (src_pixels_per_line >> 1);
+ int f2doffset = (dst_pixels_per_line >> 1);
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + f2soffset, src_pixels_per_line,
+ dst_ptr + f2doffset, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + f2soffset + 8, src_pixels_per_line,
+ dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_i_sub_pixel_variance8x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+ int f2soffset = (src_pixels_per_line >> 1);
+ int f2doffset = (dst_pixels_per_line >> 1);
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + f2soffset, src_pixels_per_line,
+ dst_ptr + f2doffset, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
new file mode 100644
index 000000000..ea80753bd
--- /dev/null
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+extern unsigned int vp8_get4x4var_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+
+unsigned int vp8_get_mb_ss_sse2
+(
+ short *src_ptr
+);
+unsigned int vp8_get16x16var_sse2
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+unsigned int vp8_get16x16pred_error_sse2
+(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride
+);
+unsigned int vp8_get8x8var_sse2
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+void vp8_filter_block2d_bil_var_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_vert_variance16x_h_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_variance16x_h_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_vert_variance16x_h_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
+
+unsigned int vp8_variance4x4_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ return (var - ((avg * avg) >> 4));
+
+}
+
+
+
+unsigned int vp8_variance8x8_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+
+ return (var - ((avg * avg) >> 6));
+
+}
+
+
+unsigned int vp8_variance16x16_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0;
+ int sum0;
+
+
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return (sse0 - ((sum0 * sum0) >> 8));
+}
+unsigned int vp8_mse16x16_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+
+ unsigned int sse0;
+ int sum0;
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return sse0;
+
+}
+
+
+unsigned int vp8_variance16x8_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp8_variance8x16_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+///////////////////////////////////////////////////////////////////////////
+// the sse2 functions that do the bilinear filtering and var calculation //
+// in one pass                                                           //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
+{
+ { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};
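+// Same two-tap filters as the MMX table in variance_mmx.c, but with each tap
+// duplicated eight times so a full 128-bit XMM register is covered per pmullw.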
+unsigned int vp8_sub_pixel_variance4x4_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum, &xxsum
+ );
+
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ // note we could avoid these if statements if the calling function
+ // just called the appropriate functions inside.
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum1, &xxsum1
+ );
+ }
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp8_sub_pixel_mse16x16_wmt(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
+
+unsigned int vp8_sub_pixel_variance16x8_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum, &xxsum
+ );
+
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 7));
+}
+
+unsigned int vp8_i_variance16x16_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+
+}
+
+unsigned int vp8_i_variance8x16_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_i_sub_pixel_variance16x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
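+    /* Thin wrapper: forward to the regular sub-pixel variance with both
+     * strides halved.
+     */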
+ return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
+}
+
+
+unsigned int vp8_i_sub_pixel_variance8x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
new file mode 100644
index 000000000..35fc90c48
--- /dev/null
+++ b/vp8/encoder/x86/variance_x86.h
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_X86_H
+#define VARIANCE_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+#if HAVE_MMX
+extern prototype_sad(vp8_sad4x4_mmx);
+extern prototype_sad(vp8_sad8x8_mmx);
+extern prototype_sad(vp8_sad8x16_mmx);
+extern prototype_sad(vp8_sad16x8_mmx);
+extern prototype_sad(vp8_sad16x16_mmx);
+extern prototype_variance(vp8_variance4x4_mmx);
+extern prototype_variance(vp8_variance8x8_mmx);
+extern prototype_variance(vp8_variance8x16_mmx);
+extern prototype_variance(vp8_variance16x8_mmx);
+extern prototype_variance(vp8_variance16x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);
+extern prototype_getmbss(vp8_get_mb_ss_mmx);
+extern prototype_variance(vp8_mse16x16_mmx);
+extern prototype_sad(vp8_get16x16pred_error_mmx);
+extern prototype_variance2(vp8_get8x8var_mmx);
+extern prototype_variance2(vp8_get16x16var_mmx);
+extern prototype_sad(vp8_get4x4sse_cs_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_mmx
+
+#undef vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_mmx
+
+#undef vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_mmx
+
+#undef vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_mmx
+
+#undef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_mmx
+
+#undef vp8_variance_var4x4
+#define vp8_variance_var4x4 vp8_variance4x4_mmx
+
+#undef vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_mmx
+
+#undef vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_mmx
+
+#undef vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_mmx
+
+#undef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_mmx
+
+#undef vp8_variance_subpixvar4x4
+#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_mmx
+
+#undef vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_mmx
+
+#undef vp8_variance_subpixvar8x16
+#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_mmx
+
+#undef vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_mmx
+
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx
+
+#undef vp8_variance_subpixmse16x16
+#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx
+
+#undef vp8_variance_getmbss
+#define vp8_variance_getmbss vp8_get_mb_ss_mmx
+
+#undef vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_mmx
+
+#undef vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx
+
+#undef vp8_variance_get8x8var
+#define vp8_variance_get8x8var vp8_get8x8var_mmx
+
+#undef vp8_variance_get16x16var
+#define vp8_variance_get16x16var vp8_get16x16var_mmx
+
+#undef vp8_variance_get4x4sse_cs
+#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_sad(vp8_sad4x4_wmt);
+extern prototype_sad(vp8_sad8x8_wmt);
+extern prototype_sad(vp8_sad8x16_wmt);
+extern prototype_sad(vp8_sad16x8_wmt);
+extern prototype_sad(vp8_sad16x16_wmt);
+extern prototype_variance(vp8_variance4x4_wmt);
+extern prototype_variance(vp8_variance8x8_wmt);
+extern prototype_variance(vp8_variance8x16_wmt);
+extern prototype_variance(vp8_variance16x8_wmt);
+extern prototype_variance(vp8_variance16x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);
+extern prototype_getmbss(vp8_get_mb_ss_sse2);
+extern prototype_variance(vp8_mse16x16_wmt);
+extern prototype_sad(vp8_get16x16pred_error_sse2);
+extern prototype_variance2(vp8_get8x8var_sse2);
+extern prototype_variance2(vp8_get16x16var_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_wmt
+
+#undef vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_wmt
+
+#undef vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_wmt
+
+#undef vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_wmt
+
+#undef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_wmt
+
+#undef vp8_variance_var4x4
+#define vp8_variance_var4x4 vp8_variance4x4_wmt
+
+#undef vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_wmt
+
+#undef vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_wmt
+
+#undef vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_wmt
+
+#undef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_wmt
+
+#undef vp8_variance_subpixvar4x4
+#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_wmt
+
+#undef vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_wmt
+
+#undef vp8_variance_subpixvar8x16
+#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_wmt
+
+#undef vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_wmt
+
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt
+
+#undef vp8_variance_subpixmse16x16
+#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt
+
+#undef vp8_variance_getmbss
+#define vp8_variance_getmbss vp8_get_mb_ss_sse2
+
+#undef vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_wmt
+
+#undef vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2
+
+#undef vp8_variance_get8x8var
+#define vp8_variance_get8x8var vp8_get8x8var_sse2
+
+#undef vp8_variance_get16x16var
+#define vp8_variance_get16x16var vp8_get16x16var_sse2
+
+#endif
+#endif
+
+
+#if HAVE_SSE3
+extern prototype_sad(vp8_sad16x16_sse3);
+extern prototype_sad(vp8_sad16x8_sse3);
+extern prototype_sad_multi_same_address(vp8_sad16x16x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad16x8x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad8x16x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad8x8x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad4x4x3_sse3);
+
+extern prototype_sad_multi_dif_address(vp8_sad16x16x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_sse3
+
+#undef vp8_variance_sad16x16x3
+#define vp8_variance_sad16x16x3 vp8_sad16x16x3_sse3
+
+#undef vp8_variance_sad16x8x3
+#define vp8_variance_sad16x8x3 vp8_sad16x8x3_sse3
+
+#undef vp8_variance_sad8x16x3
+#define vp8_variance_sad8x16x3 vp8_sad8x16x3_sse3
+
+#undef vp8_variance_sad8x8x3
+#define vp8_variance_sad8x8x3 vp8_sad8x8x3_sse3
+
+#undef vp8_variance_sad4x4x3
+#define vp8_variance_sad4x4x3 vp8_sad4x4x3_sse3
+
+#undef vp8_variance_sad16x16x4d
+#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_sse3
+
+#undef vp8_variance_sad16x8x4d
+#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_sse3
+
+#undef vp8_variance_sad8x16x4d
+#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_sse3
+
+#undef vp8_variance_sad8x8x4d
+#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_sse3
+
+#undef vp8_variance_sad4x4x4d
+#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3
+
+#endif
+#endif
+
+
+#if HAVE_SSSE3
+extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
+extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad16x16x3
+#define vp8_variance_sad16x16x3 vp8_sad16x16x3_ssse3
+
+#undef vp8_variance_sad16x8x3
+#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
+
+#endif
+#endif
+
+#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
new file mode 100644
index 000000000..f1391ba8c
--- /dev/null
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "variance.h"
+#include "onyx_int.h"
+
+
+#if HAVE_MMX
+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
+{
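+    /* The 8x4 forward DCT is computed as two 4x4 transforms: the right
+     * half reads from input + 4 and writes one 4x4 coefficient block
+     * (16 shorts) further into the output.
+     */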
+ vp8_short_fdct4x4_mmx(input, output, pitch);
+ vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch)
+{
+    vp8_fast_fdct4x4_mmx(input, output, pitch);
+ vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ short *scan_mask, short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
+{
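+    /* Gather the raw pointers expected by vp8_fast_quantize_b_impl_mmx
+     * and record the end-of-block position it returns.
+     */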
+    short *scan_mask = vp8_default_zig_zag_mask;   /* d->scan_order_mask_ptr */
+ short *coeff_ptr = &b->coeff[0];
+ short *zbin_ptr = &b->zbin[0][0];
+ short *round_ptr = &b->round[0][0];
+ short *quant_ptr = &b->quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = &d->dequant[0][0];
+
+ d->eob = vp8_fast_quantize_b_impl_mmx(
+ coeff_ptr,
+ zbin_ptr,
+ qcoeff_ptr,
+ dequant_ptr,
+ scan_mask,
+ round_ptr,
+ quant_ptr,
+ dqcoeff_ptr
+ );
+}
+
+int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
+{
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_mmx(MACROBLOCK *mb)
+{
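+    /* Chroma (U/V) coefficients start at offset 256: the 16 luma 4x4
+     * blocks contribute 16 coefficients each ahead of them.
+     */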
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
+}
+
+void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
+{
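+    /* Resolve the source pixel pointer for this block (base_src plus the
+     * per-block offset) before handing off to the MMX subtract routine.
+     */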
+ unsigned char *z = *(be->base_src) + be->src;
+    int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+#if HAVE_SSE2
+void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_wmt(input, output, pitch);
+ vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch);
+}
+
+int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ short *scan_mask, short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
+{
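+    /* Same pointer-gathering shim as the MMX version above, dispatching
+     * to the SSE implementation.
+     */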
+    short *scan_mask = vp8_default_zig_zag_mask;   /* d->scan_order_mask_ptr */
+ short *coeff_ptr = &b->coeff[0];
+ short *zbin_ptr = &b->zbin[0][0];
+ short *round_ptr = &b->round[0][0];
+ short *quant_ptr = &b->quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = &d->dequant[0][0];
+
+ d->eob = vp8_fast_quantize_b_impl_sse(
+ coeff_ptr,
+ zbin_ptr,
+ qcoeff_ptr,
+ dequant_ptr,
+ scan_mask,
+ round_ptr,
+ quant_ptr,
+ dqcoeff_ptr
+ );
+}
+
+int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
+{
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_xmm(MACROBLOCK *mb)
+{
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
+}
+
+#endif
+
+void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ int flags = x86_simd_caps();
+ int mmx_enabled = flags & HAS_MMX;
+ int xmm_enabled = flags & HAS_SSE;
+ int wmt_enabled = flags & HAS_SSE2;
+ int SSE3Enabled = flags & HAS_SSE3;
+ int SSSE3Enabled = flags & HAS_SSSE3;
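+    /* "wmt" (Willamette) is the suffix used below for the SSE2 code paths. */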
+
+ /* Note:
+ *
+ * This platform can be built without runtime CPU detection as well. If
+ * you modify any of the function mappings present in this file, be sure
+     * to also update them in the static mappings (<arch>/filename_<arch>.h)
+ */
+
+ /* Override default functions with fastest ones for this CPU. */
+#if HAVE_MMX
+
+ if (mmx_enabled)
+ {
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_mmx;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_mmx;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_mmx;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_mmx;
+
+ cpi->rtcd.variance.var4x4 = vp8_variance4x4_mmx;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_mmx;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_mmx;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_mmx;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_mmx;
+
+ cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_mmx;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_mmx;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx;
+ cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx;
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
+ cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
+ cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx;
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
+
+ cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx;
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_mmx;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx;
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;
+ }
+
+#endif
+#if HAVE_SSE2
+
+ if (wmt_enabled)
+ {
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_wmt;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt;
+
+ cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_wmt;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_wmt;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_wmt;
+
+ cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_wmt;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_wmt;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt;
+ cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_wmt;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2;
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
+ /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
+
+#if 0
+ /* short SSE2 DCT currently disabled, does not match the MMX version */
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt;
+#endif
+ /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */;
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;
+
+ cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
+ /* cpi->rtcd.encodemb.sub* not implemented for wmt */
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse;
+ }
+
+#endif
+#if HAVE_SSE3
+
+ if (SSE3Enabled)
+ {
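+        /* The x3 SAD variants score three candidates from one shared base
+         * address and the x4d variants four independent addresses per call
+         * (see the prototype_sad_multi_* declarations in variance_x86.h);
+         * the sadx3 full search and sadx4 diamond search below rely on them.
+         */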
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
+ cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_sse3;
+ cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_sse3;
+ cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_sse3;
+ cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_sse3;
+ cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_sse3;
+ cpi->rtcd.search.full_search = vp8_full_search_sadx3;
+
+ cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3;
+ cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_sse3;
+ cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3;
+ cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3;
+ cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
+ cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
+ }
+
+#endif
+#if HAVE_SSSE3
+
+ if (SSSE3Enabled)
+ {
+ cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
+ cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
+ }
+
+#endif
+#endif
+}