author    Allan Sandfeld Jensen <allan.jensen@theqtcompany.com>  2015-06-18 14:10:49 +0200
committer Oswald Buddenhagen <oswald.buddenhagen@theqtcompany.com>  2015-06-18 13:53:24 +0000
commit    813fbf95af77a531c57a8c497345ad2c61d475b3 (patch)
tree      821b2c8de8365f21b6c9ba17a236fb3006a1d506 /chromium/third_party/libvpx/source/libvpx
parent    af6588f8d723931a298c995fa97259bb7f7deb55 (diff)
BASELINE: Update chromium to 44.0.2403.47
Change-Id: Ie056fedba95cf5e5c76b30c4b2c80fca4764aa2f
Reviewed-by: Oswald Buddenhagen <oswald.buddenhagen@theqtcompany.com>
Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx')
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/.mailmap | 20
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/AUTHORS | 29
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/CHANGELOG | 23
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/PATENTS | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/README | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/build/arm-msvs/obj_int_extract.bat | 18
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/build/make/Android.mk | 69
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/build/make/Makefile | 30
-rw-r--r-- [-rwxr-xr-x] chromium/third_party/libvpx/source/libvpx/build/make/configure.sh | 1918
-rwxr-xr-x chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_proj.sh | 42
-rwxr-xr-x chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh | 33
-rwxr-xr-x chromium/third_party/libvpx/source/libvpx/build/make/iosbuild.sh | 34
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/build/make/obj_int_extract.c | 857
-rwxr-xr-x chromium/third_party/libvpx/source/libvpx/build/make/rtcd.pl | 13
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/build/x86-msvs/obj_int_extract.bat | 15
-rwxr-xr-x chromium/third_party/libvpx/source/libvpx/configure | 49
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples.mk | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/postproc.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/resize_util.c | 25
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/set_maps.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c | 954
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c | 21
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c | 91
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/libs.doxy_template | 14
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/libs.mk | 75
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/mainpage.dox | 22
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/solution.mk | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/README.libvpx | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/compare.h | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert.h | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_argb.h | 26
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from.h | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/format_conversion.h | 168
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/row.h | 598
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale.h | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h | 23
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/version.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/video_common.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare.cc | 52
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon.cc | 44
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon64.cc | 63
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_posix.cc | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_win.cc | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert.cc | 429
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_argb.cc | 351
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from.cc | 209
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from_argb.cc | 405
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_argb.cc | 59
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_i420.cc | 52
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/cpu_id.cc | 24
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/format_conversion.cc | 554
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/mjpeg_validate.cc | 76
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/planar_functions.cc | 437
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate.cc | 180
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_argb.cc | 64
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon.cc | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon64.cc | 543
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_any.cc | 537
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_common.cc | 559
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_mips.cc | 91
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon.cc | 441
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon64.cc | 2645
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_posix.cc | 3950
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_win.cc | 3519
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale.cc | 191
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_argb.cc | 118
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_common.cc | 48
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon.cc | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon64.cc | 764
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_posix.cc | 421
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc | 288
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/video_common.cc | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/x86inc/README.libvpx | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/x86inc/x86inc.asm | 17
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/usage.dox | 19
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/usage_cx.dox | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c | 184
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c | 35
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/common.h | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c | 32
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/debugmodes.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/dequantize.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.c | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/extend.c | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/idct_blk.c | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c | 14
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c | 39
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/copy_altivec.asm | 47
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_altivec.asm | 1013
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm | 677
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/idctllm_altivec.asm | 189
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_altivec.c | 135
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm | 1253
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/platform_altivec.asm | 59
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/recon_altivec.asm | 175
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/sad_altivec.asm | 277
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/systemdependent.c | 165
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_altivec.asm | 375
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm | 865
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.c | 57
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra.c | 22
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd.c | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl | 111
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/sad_c.c | 302
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.c | 12
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h | 37
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse2.asm | 93
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse3.asm | 146
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse2.asm | 410
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse3.asm | 960
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse4.asm | 353
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_asm_stubs.c | 22
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c | 65
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c | 54
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm | 310
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm | 317
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm | 352
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm | 471
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm | 225
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm | 272
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/boolhuff_arm.c | 41
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm | 258
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c | 89
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c | 64
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h | 31
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c | 45
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c | 26
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.c | 12
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c | 36
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c | 34
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c | 60
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c | 483
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h | 12
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c | 343
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/csystemdependent.c | 160
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/encodemb_altivec.asm | 153
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/fdct_altivec.asm | 205
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/rdopt_altivec.asm | 51
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.c | 45
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c | 80
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c | 14
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/vp8_asm_enc_offsets.c | 93
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c | 131
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c | 33
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.c | 390
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm) | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.c | 357
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm) | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon.c | 145
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.c | 92
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm | 69
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c | 59
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.c | 1332
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c | 14
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c | 163
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.c | 750
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c | 48
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.c | 151
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c | 62
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.c | 547
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm | 237
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c | 248
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm | 698
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c | 624
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c | 188
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon.c | 274
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm | 277
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon.c | 453
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm) | 257
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.c | 58
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c | 473
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c | 1045
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c | 880
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c | 856
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c | 335
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c | 300
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h | 157
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c | 948
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c | 1077
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h | 867
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c | 182
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c | 48
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h | 33
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c | 42
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c | 28
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h | 12
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c | 89
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c | 41
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c | 570
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h | 72
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c | 627
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h | 33
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c | 394
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.h | 31
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c | 118
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h | 25
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h | 175
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c | 141
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c | 64
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h | 28
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c | 18
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h | 25
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c | 379
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h | 37
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c | 210
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd.c | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl | 870
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread.h | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.c | 436
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.h | 57
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c | 34
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c | 255
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c | 1820
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h | 33
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c | 762
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm | 310
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c | 111
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 34
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm | 287
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c | 108
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 52
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c | 789
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c | 219
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c | 234
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.h | 45
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c | 45
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c | 364
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h | 74
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_read_bit_buffer.c | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_reader.h | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c | 49
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c | 18
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c | 7
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c | 155
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c | 117
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c | 505
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h | 59
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c | 92
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c | 177
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c | 255
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h | 13
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_blockiness.c | 138
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c | 42
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c | 360
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h | 61
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c | 154
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c | 2491
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c | 179
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c | 2130
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h | 196
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c | 179
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.h | 25
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c | 20
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_fastssim.c | 465
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c | 606
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h | 14
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c | 56
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c | 40
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c | 444
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h | 13
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c | 30
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c | 1628
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnrhvs.c | 223
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c | 184
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c | 347
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h | 54
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c | 142
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h | 30
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c | 1086
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h | 17
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c | 38
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.c | 104
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.h | 35
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c | 314
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h | 54
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c | 247
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h | 52
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c | 57
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c | 35
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c | 48
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c | 627
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h | 76
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c | 20
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c | 381
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c | 13
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c | 912
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c | 1022
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm | 31
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c | 1375
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.h | 464
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c | 469
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm | 73
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c | 331
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm | 46
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c | 71
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c | 180
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm | 1055
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm | 313
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c | 580
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c | 201
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 93
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm | 370
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c | 28
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk | 90
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c | 246
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c | 678
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk | 35
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9dx.mk | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/exports_enc | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c | 73
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h | 39
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h | 413
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h | 40
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h | 46
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vpx_frame_buffer.h | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vpx_image.h | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vpx_integer.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c | 226
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_media.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm) | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c) | 112
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad.c) | 105
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk | 40
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c | 17
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 395
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm | 289
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm | 365
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c) | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_sse2.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm) | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c) | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_mmx.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_mmx.asm) | 30
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse2.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm) | 12
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm) | 34
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm) | 30
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_ssse3.asm) | 180
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h | 64
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_tracker.h | 179
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_alloc.c | 58
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_base.c | 405
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c | 53
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_grow.c | 49
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_largest.c | 57
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_resize.c | 114
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_shrink.c | 103
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_true.c | 31
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_if.h | 228
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_impl.h | 1152
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/heapmm.h | 155
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h | 120
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h | 159
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c | 586
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h | 131
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.mk | 18
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem_tracker.c | 740
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c | 29
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/asm_offsets.h | 31
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h | 55
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c | 18
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c | 56
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c | 40
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale.mk | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_asm_offsets.c | 40
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_rtcd.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/win32/scaleopt.c | 525
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/yv12config.h | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpxdec.c | 54
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpxenc.c | 58
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpxstats.c | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/webmdec.cc | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/webmdec.h | 2
458 files changed, 49796 insertions, 42047 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/.mailmap b/chromium/third_party/libvpx/source/libvpx/.mailmap
index fb82a24e345..0bfda120f98 100644
--- a/chromium/third_party/libvpx/source/libvpx/.mailmap
+++ b/chromium/third_party/libvpx/source/libvpx/.mailmap
@@ -1,18 +1,26 @@
Adrian Grange <agrange@google.com>
+Alex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Deb Mukherjee <debargha@google.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
Hangyu Kuang <hkuang@google.com>
Jim Bankoski <jimbankoski@google.com>
-John Koleszar <jkoleszar@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+John Koleszar <jkoleszar@google.com>
+Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Marco Paniconi <marpan@google.com>
+Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Pascal Massimino <pascal.massimino@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Sami Pietilä <samipietila@google.com>
+Tamar Levy <tamar.levy@intel.com>
+Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
-Ralph Giles <giles@xiph.org> <giles@entropywave.com>
-Ralph Giles <giles@xiph.org> <giles@mozilla.com>
-Alpha Lam <hclam@google.com> <hclam@chromium.org>
-Deb Mukherjee <debargha@google.com>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
diff --git a/chromium/third_party/libvpx/source/libvpx/AUTHORS b/chromium/third_party/libvpx/source/libvpx/AUTHORS
index a9aa4810634..2f63d7c5afb 100644
--- a/chromium/third_party/libvpx/source/libvpx/AUTHORS
+++ b/chromium/third_party/libvpx/source/libvpx/AUTHORS
@@ -3,10 +3,11 @@
Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com>
Ahmad Sharif <asharif@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru>
-Alex Converse <alex.converse@gmail.com>
+Alex Converse <aconverse@google.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
@@ -14,44 +15,58 @@ A.Mahfoodh <ab.mahfoodh@gmail.com>
Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
+Andrew Russell <anrussell@google.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
changjun.yang <changjun.yang@intel.com>
+Charles 'Buck' Krasic <ckrasic@google.com>
chm <chm@rock-chips.com>
Christian Duvivier <cduvivier@google.com>
Daniel Kang <ddkang@google.com>
Deb Mukherjee <debargha@google.com>
+Dim Temp <dimtemp0@gmail.com>
Dmitry Kovalev <dkovalev@google.com>
Dragan Mrdjan <dmrdjan@mips.com>
-Erik Niemeyer <erik.a.niemeyer@gmail.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
Fredrik Söderquist <fs@opera.com>
Fritz Koenig <frkoenig@google.com>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
Giuseppe Scrivano <gscrivano@gnu.org>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
Guillaume Martres <gmartres@google.com>
Guillermo Ballester Valor <gbvalor@gmail.com>
Hangyu Kuang <hkuang@google.com>
+Hanno Böck <hanno@hboeck.de>
Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com>
Ivan Maltz <ivanmaltz@google.com>
+Jacek Caban <cjacek@gmail.com>
+JackyChen <jackychen@google.com>
James Berry <jamesberry@google.com>
+James Yu <james.yu@linaro.org>
James Zern <jzern@google.com>
+Jan Gerber <j@mailb.org>
Jan Kratochvil <jan.kratochvil@redhat.com>
Janne Salonen <jsalonen@google.com>
Jeff Faust <jfaust@google.com>
Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
+Jia Jia <jia.jia@linaro.org>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
+Joey Parrish <joeyparrish@google.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
+John Stark <jhnstrk@gmail.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
Joshua Litt <joshualitt@google.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net>
+Lawrence Velázquez <larryv@macports.org>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
@@ -65,6 +80,7 @@ Michael Kohler <michaelkohler@live.com>
Mike Frysinger <vapier@chromium.org>
Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
+Minghai Shang <minghai@google.com>
Morton Jonuschat <yabawock@gmail.com>
Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com>
@@ -72,6 +88,8 @@ Patrik Westin <patrik.westin@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com>
+Pengchong Jin <pengchong@google.com>
+Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
@@ -79,22 +97,29 @@ Rafaël Carré <funman@videolan.org>
Ralph Giles <giles@xiph.org>
Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rbultje@google.com>
+Rui Ueyama <ruiu@google.com>
Sami Pietilä <samipietila@google.com>
Scott Graham <scottmg@chromium.org>
Scott LaVarnway <slavarnway@google.com>
+Sean McGovern <gseanmcg@gmail.com>
+Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com>
Stefan Holmer <holmer@google.com>
Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
Tamar Levy <tamar.levy@intel.com>
+Tao Bai <michaelbai@chromium.org>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
+Tim Kopp <tkopp@google.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Vignesh Venkatasubramanian <vigneshv@google.com>
Yaowu Xu <yaowu@google.com>
+Yongzhe Wang <yongzhe@google.com>
Yunqing Wang <yunqingwang@google.com>
+Zoe Liu <zoeliu@google.com>
Google Inc.
The Mozilla Foundation
The Xiph.Org Foundation
diff --git a/chromium/third_party/libvpx/source/libvpx/CHANGELOG b/chromium/third_party/libvpx/source/libvpx/CHANGELOG
index 97c9a7bd325..a318784150a 100644
--- a/chromium/third_party/libvpx/source/libvpx/CHANGELOG
+++ b/chromium/third_party/libvpx/source/libvpx/CHANGELOG
@@ -1,3 +1,26 @@
+2015-04-03 v1.4.0 "Indian Runner Duck"
+ This release includes significant improvements to the VP9 codec.
+
+ - Upgrading:
+ This release is ABI incompatible with 1.3.0. It drops the compatibility
+ layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
+ controls for VP9.
+
+ - Enhancements:
+ Faster VP9 encoding and decoding
+ Multithreaded VP9 decoding (tile and frame-based)
+ Multithreaded VP9 encoding - on by default
+ YUV 4:2:2 and 4:4:4 support in VP9
+ 10 and 12bit support in VP9
+ 64bit ARM support by replacing ARM assembly with intrinsics
+
+ - Bug Fixes:
+ Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
+ files.
+
+ - Known Issues:
+ Frame Parallel decoding fails for segmented and non-420 files.
+
2013-11-15 v1.3.0 "Forest"
This release introduces the VP9 codec in a backward-compatible way.
All existing users of VP8 can continue to use the library without
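
The 1.4.0 notes above cover two API-visible changes: the dropped compatibility layer (IMG_FMT_* becomes VPX_IMG_FMT_*) and multithreaded, frame-based VP9 decoding. A minimal C sketch of a caller written against this baseline, using only the public vpx_image.h/vpx_decoder.h API; the helper name, frame size, and thread count below are illustrative assumptions, not code from this commit:

/* Hedged sketch: exercises the renamed image-format constants and the
 * frame-threaded VP9 decoder init mentioned in the CHANGELOG above.
 * Error handling is abbreviated. */
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"       /* vpx_codec_vp9_dx() */
#include "vpx/vpx_image.h"

static int open_vp9_decoder(vpx_codec_ctx_t *codec) {
  vpx_image_t img;
  vpx_codec_dec_cfg_t cfg = {0};

  /* 1.4.0 drops the compatibility layer: VPX_IMG_FMT_I420, not IMG_FMT_I420. */
  if (!vpx_img_alloc(&img, VPX_IMG_FMT_I420, 640, 480, 16))
    return -1;
  vpx_img_free(&img);

  cfg.threads = 4;  /* tile/frame worker threads */
  /* Frame-parallel decoding is requested with an init-time flag; per the
   * known issues above it fails for segmented and non-4:2:0 streams. */
  if (vpx_codec_dec_init(codec, vpx_codec_vp9_dx(), &cfg,
                         VPX_CODEC_USE_FRAME_THREADING) != VPX_CODEC_OK)
    return -1;
  return 0;
}

On failure, vpx_codec_error(codec) returns a printable description of the error.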
diff --git a/chromium/third_party/libvpx/source/libvpx/PATENTS b/chromium/third_party/libvpx/source/libvpx/PATENTS
index 79d17d7d6a9..caedf607e95 100644
--- a/chromium/third_party/libvpx/source/libvpx/PATENTS
+++ b/chromium/third_party/libvpx/source/libvpx/PATENTS
@@ -17,7 +17,7 @@ or agree to the institution of patent litigation or any other patent
enforcement activity against any entity (including a cross-claim or
counterclaim in a lawsuit) alleging that any of these implementations of WebM
or any code incorporated within any of these implementations of WebM
-constitutes direct or contributory patent infringement, or inducement of
+constitute direct or contributory patent infringement, or inducement of
patent infringement, then any patent rights granted to you under this License
for these implementations of WebM shall terminate as of the date such
litigation is filed.
diff --git a/chromium/third_party/libvpx/source/libvpx/README b/chromium/third_party/libvpx/source/libvpx/README
index 6f864d8591a..fcd1c2e18cf 100644
--- a/chromium/third_party/libvpx/source/libvpx/README
+++ b/chromium/third_party/libvpx/source/libvpx/README
@@ -1,4 +1,4 @@
-README - 30 May 2014
+README - 23 March 2015
Welcome to the WebM VP8/VP9 Codec SDK!
@@ -47,10 +47,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
--help output of the configure script. As of this writing, the list of
available targets is:
- armv5te-android-gcc
- armv5te-linux-rvct
- armv5te-linux-gcc
- armv5te-none-rvct
armv6-darwin-gcc
armv6-linux-rvct
armv6-linux-gcc
@@ -66,12 +62,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv7s-darwin-gcc
mips32-linux-gcc
mips64-linux-gcc
- ppc32-darwin8-gcc
- ppc32-darwin9-gcc
- ppc32-linux-gcc
- ppc64-darwin8-gcc
- ppc64-darwin9-gcc
- ppc64-linux-gcc
sparc-solaris-gcc
x86-android-gcc
x86-darwin8-gcc
@@ -82,6 +72,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86-darwin11-gcc
x86-darwin12-gcc
x86-darwin13-gcc
+ x86-darwin14-gcc
x86-iphonesimulator-gcc
x86-linux-gcc
x86-linux-icc
@@ -99,6 +90,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86_64-darwin11-gcc
x86_64-darwin12-gcc
x86_64-darwin13-gcc
+ x86_64-darwin14-gcc
x86_64-iphonesimulator-gcc
x86_64-linux-gcc
x86_64-linux-icc
@@ -115,6 +107,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
universal-darwin11-gcc
universal-darwin12-gcc
universal-darwin13-gcc
+ universal-darwin14-gcc
generic-gnu
The generic-gnu target, in conjunction with the CROSS environment variable,
diff --git a/chromium/third_party/libvpx/source/libvpx/build/arm-msvs/obj_int_extract.bat b/chromium/third_party/libvpx/source/libvpx/build/arm-msvs/obj_int_extract.bat
deleted file mode 100644
index c0987bcf7bf..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/build/arm-msvs/obj_int_extract.bat
+++ /dev/null
@@ -1,18 +0,0 @@
-REM Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-REM
-REM Use of this source code is governed by a BSD-style license
-REM that can be found in the LICENSE file in the root of the source
-REM tree. An additional intellectual property rights grant can be found
-REM in the file PATENTS. All contributing project authors may
-REM be found in the AUTHORS file in the root of the source tree.
-echo on
-
-REM Arguments:
-REM %1 - Relative path to the directory containing the vp8 and vpx_scale
-REM source directories.
-REM %2 - Path to obj_int_extract.exe.
-cl /I. /I%1 /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%~1/vp8/encoder/vp8_asm_enc_offsets.c"
-%2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
-
-cl /I. /I%1 /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%~1/vpx_scale/vpx_scale_asm_offsets.c"
-%2\obj_int_extract.exe rvds "vpx_scale_asm_offsets.obj" > "vpx_scale_asm_offsets.asm"
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
index 816334e040d..0add523f99d 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
@@ -43,7 +43,7 @@
# will remove any NEON dependency.
# To change to building armeabi, run ./libvpx/configure again, but with
-# --target=arm5te-android-gcc and modify the Application.mk file to
+# --target=armv6-android-gcc and modify the Application.mk file to
# set APP_ABI := armeabi
#
# Running ndk-build will build libvpx and include it in your project.
@@ -60,7 +60,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
include $(CONFIG_DIR)libs-armv7-android-gcc.mk
LOCAL_ARM_MODE := arm
else ifeq ($(TARGET_ARCH_ABI),armeabi)
- include $(CONFIG_DIR)libs-armv5te-android-gcc.mk
+ include $(CONFIG_DIR)libs-armv6-android-gcc.mk
LOCAL_ARM_MODE := arm
else ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
include $(CONFIG_DIR)libs-armv8-android-gcc.mk
@@ -91,51 +91,8 @@ LOCAL_CFLAGS := -O3
# like x86inc.asm and x86_abi_support.asm
LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)
-# -----------------------------------------------------------------------------
-# Template : asm_offsets_template
-# Arguments : 1: assembly offsets file to be created
-# 2: c file to base assembly offsets on
-# Returns : None
-# Usage : $(eval $(call asm_offsets_template,<asmfile>, <srcfile>
-# Rationale : Create offsets at compile time using for structures that are
-# defined in c, but used in assembly functions.
-# -----------------------------------------------------------------------------
-define asm_offsets_template
-
-_SRC:=$(2)
-_OBJ:=$(ASM_CNV_PATH)/$$(notdir $(2)).S
-
-_FLAGS = $$($$(my)CFLAGS) \
- $$(call get-src-file-target-cflags,$(2)) \
- $$(call host-c-includes,$$(LOCAL_C_INCLUDES) $$(CONFIG_DIR)) \
- $$(LOCAL_CFLAGS) \
- $$(NDK_APP_CFLAGS) \
- $$(call host-c-includes,$$($(my)C_INCLUDES)) \
- -DINLINE_ASM \
- -S \
-
-_TEXT = "Compile $$(call get-src-file-text,$(2))"
-_CC = $$(TARGET_CC)
-
-$$(eval $$(call ev-build-file))
-
-$(1) : $$(_OBJ) $(2)
- @mkdir -p $$(dir $$@)
- @grep $(OFFSET_PATTERN) $$< | tr -d '\#' | $(CONFIG_DIR)$(ASM_CONVERSION) > $$@
-endef
-
-# Use ads2gas script to convert from RVCT format to GAS format. This
-# puts the processed file under $(ASM_CNV_PATH). Local clean rule
-# to handle removing these
-ifeq ($(CONFIG_VP8_ENCODER), yes)
- ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm
-endif
-ifeq ($(HAVE_NEON_ASM), yes)
- ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vpx_scale_asm_offsets.asm
-endif
-
.PRECIOUS: %.asm.s
-$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm $(ASM_CNV_OFFSETS_DEPEND)
+$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm
@mkdir -p $(dir $@)
@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
@@ -201,8 +158,6 @@ LOCAL_CFLAGS += \
LOCAL_MODULE := libvpx
-LOCAL_LDLIBS := -llog
-
ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
LOCAL_STATIC_LIBRARIES := cpufeatures
endif
@@ -215,6 +170,7 @@ ifeq ($(CONFIG_VP9), yes)
$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp9_rtcd.h
endif
$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_scale_rtcd.h
+$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_dsp_rtcd.h
ifeq ($(TARGET_ARCH_ABI),x86)
$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_config.asm
@@ -224,22 +180,13 @@ endif
clean:
@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
- @$(RM) $(patsubst %.asm, %.*, $(ASM_CNV_OFFSETS_DEPEND))
@$(RM) -r $(ASM_CNV_PATH)
@$(RM) $(CLEAN-OBJS)
-include $(BUILD_SHARED_LIBRARY)
-
-ifeq ($(HAVE_NEON), yes)
- $(eval $(call asm_offsets_template,\
- $(ASM_CNV_PATH)/vpx_scale_asm_offsets.asm, \
- $(LIBVPX_PATH)/vpx_scale/vpx_scale_asm_offsets.c))
-endif
-
-ifeq ($(CONFIG_VP8_ENCODER), yes)
- $(eval $(call asm_offsets_template,\
- $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm, \
- $(LIBVPX_PATH)/vp8/encoder/vp8_asm_enc_offsets.c))
+ifeq ($(ENABLE_SHARED),1)
+ include $(BUILD_SHARED_LIBRARY)
+else
+ include $(BUILD_STATIC_LIBRARY)
endif
ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
index ed90397f0ff..fc7749a5519 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
@@ -146,6 +146,7 @@ $(BUILD_PFX)%.c.d: %.c
$(BUILD_PFX)%.c.o: %.c
$(if $(quiet),@echo " [CC] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -c -o $@ $<
$(BUILD_PFX)%.cc.d: %.cc
@@ -155,6 +156,7 @@ $(BUILD_PFX)%.cc.d: %.cc
$(BUILD_PFX)%.cc.o: %.cc
$(if $(quiet),@echo " [CXX] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $<
$(BUILD_PFX)%.cpp.d: %.cpp
@@ -164,6 +166,7 @@ $(BUILD_PFX)%.cpp.d: %.cpp
$(BUILD_PFX)%.cpp.o: %.cpp
$(if $(quiet),@echo " [CXX] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $<
$(BUILD_PFX)%.asm.d: %.asm
@@ -174,6 +177,7 @@ $(BUILD_PFX)%.asm.d: %.asm
$(BUILD_PFX)%.asm.o: %.asm
$(if $(quiet),@echo " [AS] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(AS) $(ASFLAGS) -o $@ $<
$(BUILD_PFX)%.s.d: %.s
@@ -184,12 +188,14 @@ $(BUILD_PFX)%.s.d: %.s
$(BUILD_PFX)%.s.o: %.s
$(if $(quiet),@echo " [AS] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(AS) $(ASFLAGS) -o $@ $<
.PRECIOUS: %.c.S
%.c.S: CFLAGS += -DINLINE_ASM
$(BUILD_PFX)%.c.S: %.c
$(if $(quiet),@echo " [GEN] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(CC) -S $(CFLAGS) -o $@ $<
.PRECIOUS: %.asm.s
@@ -217,14 +223,6 @@ else
endif
#
-# Rule to extract assembly constants from C sources
-#
-obj_int_extract: build/make/obj_int_extract.c
- $(if $(quiet),@echo " [HOSTCC] $@")
- $(qexec)$(HOSTCC) -I. -I$(SRC_PATH_BARE) -o $@ $<
-CLEAN-OBJS += obj_int_extract
-
-#
# Utility functions
#
pairmap=$(if $(strip $(2)),\
@@ -340,9 +338,11 @@ endif
skip_deps := $(filter %clean,$(MAKECMDGOALS))
skip_deps += $(findstring testdata,$(MAKECMDGOALS))
ifeq ($(strip $(skip_deps)),)
- # Older versions of make don't like -include directives with no arguments
- ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
- -include $(filter %.d,$(OBJS-yes:.o=.d))
+ ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes)
+ # Older versions of make don't like -include directives with no arguments
+ ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
+ -include $(filter %.d,$(OBJS-yes:.o=.d))
+ endif
endif
endif
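Taken together with the new mkdir calls above, the effect of CONFIG_DEPENDENCY_TRACKING is: when tracking is disabled, the .d files are never demanded (nothing includes them), so each object rule now creates its own output directory. A plausible one-shot build using the toggle from the help text:

    ./configure --disable-dependency-tracking
    make    # faster cold build; later header edits won't trigger rebuilds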
@@ -383,8 +383,8 @@ LIBS=$(call enabled,LIBS)
.libs: $(LIBS)
@touch $@
$(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib))))
-$(foreach lib,$(filter %so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
-$(foreach lib,$(filter %$(VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
+$(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
+$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS)
ifeq ($(MAKECMDGOALS),dist)
@@ -424,11 +424,7 @@ ifneq ($(call enabled,DIST-SRCS),)
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_sln.sh
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_vcxproj.sh
DIST-SRCS-$(CONFIG_MSVS) += build/make/msvs_common.sh
- DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/obj_int_extract.bat
- DIST-SRCS-$(CONFIG_MSVS) += build/arm-msvs/obj_int_extract.bat
DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
- # Include obj_int_extract if we use offsets from *_asm_*_offsets
- DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64) += build/make/obj_int_extract.c
DIST-SRCS-$(ARCH_ARM) += build/make/ads2gas.pl
DIST-SRCS-$(ARCH_ARM) += build/make/ads2gas_apple.pl
DIST-SRCS-$(ARCH_ARM) += build/make/ads2armasm_ms.pl
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
index 56e9f4406e8..68cc8bb4a5d 100755..100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
@@ -14,62 +14,56 @@
# Logging / Output Functions
#
die_unknown(){
- echo "Unknown option \"$1\"."
- echo "See $0 --help for available options."
- clean_temp_files
- exit 1
+ echo "Unknown option \"$1\"."
+ echo "See $0 --help for available options."
+ clean_temp_files
+ exit 1
}
-
die() {
- echo "$@"
- echo
- echo "Configuration failed. This could reflect a misconfiguration of your"
- echo "toolchains, improper options selected, or another problem. If you"
- echo "don't see any useful error messages above, the next step is to look"
- echo "at the configure error log file ($logfile) to determine what"
- echo "configure was trying to do when it died."
- clean_temp_files
- exit 1
+ echo "$@"
+ echo
+ echo "Configuration failed. This could reflect a misconfiguration of your"
+ echo "toolchains, improper options selected, or another problem. If you"
+ echo "don't see any useful error messages above, the next step is to look"
+ echo "at the configure error log file ($logfile) to determine what"
+ echo "configure was trying to do when it died."
+ clean_temp_files
+ exit 1
}
-
log(){
- echo "$@" >>$logfile
+ echo "$@" >>$logfile
}
-
log_file(){
- log BEGIN $1
- cat -n $1 >>$logfile
- log END $1
+ log BEGIN $1
+ cat -n $1 >>$logfile
+ log END $1
}
-
log_echo() {
- echo "$@"
- log "$@"
+ echo "$@"
+ log "$@"
}
-
fwrite () {
- outfile=$1
- shift
- echo "$@" >> ${outfile}
+ outfile=$1
+ shift
+ echo "$@" >> ${outfile}
}
-
show_help_pre(){
- for opt in ${CMDLINE_SELECT}; do
- opt2=`echo $opt | sed -e 's;_;-;g'`
- if enabled $opt; then
- eval "toggle_${opt}=\"--disable-${opt2}\""
- else
- eval "toggle_${opt}=\"--enable-${opt2} \""
- fi
- done
+ for opt in ${CMDLINE_SELECT}; do
+ opt2=`echo $opt | sed -e 's;_;-;g'`
+ if enabled $opt; then
+ eval "toggle_${opt}=\"--disable-${opt2}\""
+ else
+ eval "toggle_${opt}=\"--enable-${opt2} \""
+ fi
+ done
- cat <<EOF
+ cat <<EOF
Usage: configure [options]
Options:
@@ -89,6 +83,8 @@ Build options:
${toggle_gprof} enable/disable gprof profiling instrumentation
${toggle_gcov} enable/disable gcov coverage instrumentation
${toggle_thumb} enable/disable building arm assembly in thumb mode
+ ${toggle_dependency_tracking}
+ disable to speed up one-time build
Install options:
${toggle_install_docs} control whether docs are installed
@@ -100,9 +96,8 @@ Install options:
EOF
}
-
show_help_post(){
- cat <<EOF
+ cat <<EOF
NOTES:
@@ -119,150 +114,137 @@ EOF
exit 1
}
-
show_targets() {
- while [ -n "$*" ]; do
- if [ "${1%%-*}" = "${2%%-*}" ]; then
- if [ "${2%%-*}" = "${3%%-*}" ]; then
- printf " %-24s %-24s %-24s\n" "$1" "$2" "$3"
- shift; shift; shift
- else
- printf " %-24s %-24s\n" "$1" "$2"
- shift; shift
- fi
- else
- printf " %-24s\n" "$1"
- shift
- fi
- done
+ while [ -n "$*" ]; do
+ if [ "${1%%-*}" = "${2%%-*}" ]; then
+ if [ "${2%%-*}" = "${3%%-*}" ]; then
+ printf " %-24s %-24s %-24s\n" "$1" "$2" "$3"
+ shift; shift; shift
+ else
+ printf " %-24s %-24s\n" "$1" "$2"
+ shift; shift
+ fi
+ else
+ printf " %-24s\n" "$1"
+ shift
+ fi
+ done
}
-
show_help() {
- show_help_pre
- show_help_post
+ show_help_pre
+ show_help_post
}
#
# List Processing Functions
#
set_all(){
- value=$1
- shift
- for var in $*; do
- eval $var=$value
- done
+ value=$1
+ shift
+ for var in $*; do
+ eval $var=$value
+ done
}
-
is_in(){
- value=$1
- shift
- for var in $*; do
- [ $var = $value ] && return 0
- done
- return 1
+ value=$1
+ shift
+ for var in $*; do
+ [ $var = $value ] && return 0
+ done
+ return 1
}
-
add_cflags() {
- CFLAGS="${CFLAGS} $@"
- CXXFLAGS="${CXXFLAGS} $@"
+ CFLAGS="${CFLAGS} $@"
+ CXXFLAGS="${CXXFLAGS} $@"
}
-
add_cflags_only() {
- CFLAGS="${CFLAGS} $@"
+ CFLAGS="${CFLAGS} $@"
}
-
add_cxxflags_only() {
- CXXFLAGS="${CXXFLAGS} $@"
+ CXXFLAGS="${CXXFLAGS} $@"
}
-
add_ldflags() {
- LDFLAGS="${LDFLAGS} $@"
+ LDFLAGS="${LDFLAGS} $@"
}
-
add_asflags() {
- ASFLAGS="${ASFLAGS} $@"
+ ASFLAGS="${ASFLAGS} $@"
}
-
add_extralibs() {
- extralibs="${extralibs} $@"
+ extralibs="${extralibs} $@"
}
#
# Boolean Manipulation Functions
#
enable_feature(){
- set_all yes $*
+ set_all yes $*
}
disable_feature(){
- set_all no $*
+ set_all no $*
}
enabled(){
- eval test "x\$$1" = "xyes"
+ eval test "x\$$1" = "xyes"
}
disabled(){
- eval test "x\$$1" = "xno"
+ eval test "x\$$1" = "xno"
}
-
soft_enable() {
- for var in $*; do
- if ! disabled $var; then
- log_echo " enabling $var"
- enable_feature $var
- fi
- done
+ for var in $*; do
+ if ! disabled $var; then
+ enabled $var || log_echo " enabling $var"
+ enable_feature $var
+ fi
+ done
}
soft_disable() {
- for var in $*; do
- if ! enabled $var; then
- log_echo " disabling $var"
- disable_feature $var
- fi
- done
+ for var in $*; do
+ if ! enabled $var; then
+ disabled $var || log_echo " disabling $var"
+ disable_feature $var
+ fi
+ done
}
-
#
# Text Processing Functions
#
toupper(){
- echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
+ echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
}
-
tolower(){
- echo "$@" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz
+ echo "$@" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz
}
-
#
# Temporary File Functions
#
source_path=${0%/*}
enable_feature source_path_used
if [ -z "$source_path" ] || [ "$source_path" = "." ]; then
- source_path="`pwd`"
- disable_feature source_path_used
+ source_path="`pwd`"
+ disable_feature source_path_used
fi
if test ! -z "$TMPDIR" ; then
- TMPDIRx="${TMPDIR}"
+ TMPDIRx="${TMPDIR}"
elif test ! -z "$TEMPDIR" ; then
- TMPDIRx="${TEMPDIR}"
+ TMPDIRx="${TEMPDIR}"
else
- TMPDIRx="/tmp"
+ TMPDIRx="/tmp"
fi
RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h"
@@ -273,76 +255,77 @@ TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x"
TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm"
clean_temp_files() {
- rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
- enabled gcov && rm -f ${TMP_C%.c}.gcno ${TMP_CC%.cc}.gcno
+ rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
+ enabled gcov && rm -f ${TMP_C%.c}.gcno ${TMP_CC%.cc}.gcno
}
#
# Toolchain Check Functions
#
check_cmd() {
- enabled external_build && return
- log "$@"
- "$@" >>${logfile} 2>&1
+ enabled external_build && return
+ log "$@"
+ "$@" >>${logfile} 2>&1
}
check_cc() {
- log check_cc "$@"
- cat >${TMP_C}
- log_file ${TMP_C}
- check_cmd ${CC} ${CFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
+ log check_cc "$@"
+ cat >${TMP_C}
+ log_file ${TMP_C}
+ check_cmd ${CC} ${CFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
}
check_cxx() {
- log check_cxx "$@"
- cat >${TMP_CC}
- log_file ${TMP_CC}
- check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
+ log check_cxx "$@"
+ cat >${TMP_CC}
+ log_file ${TMP_CC}
+ check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
}
check_cpp() {
- log check_cpp "$@"
- cat > ${TMP_C}
- log_file ${TMP_C}
- check_cmd ${CC} ${CFLAGS} "$@" -E -o ${TMP_O} ${TMP_C}
+ log check_cpp "$@"
+ cat > ${TMP_C}
+ log_file ${TMP_C}
+ check_cmd ${CC} ${CFLAGS} "$@" -E -o ${TMP_O} ${TMP_C}
}
check_ld() {
- log check_ld "$@"
- check_cc $@ \
- && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs}
+ log check_ld "$@"
+ check_cc $@ \
+ && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs}
}
check_header(){
- log check_header "$@"
- header=$1
- shift
- var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
- disable_feature $var
- check_cpp "$@" <<EOF && enable_feature $var
+ log check_header "$@"
+ header=$1
+ shift
+ var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
+ disable_feature $var
+ check_cpp "$@" <<EOF && enable_feature $var
#include "$header"
int x;
EOF
}
-
check_cflags() {
- log check_cflags "$@"
- check_cc -Werror "$@" <<EOF
+ log check_cflags "$@"
+ check_cc -Werror "$@" <<EOF
int x;
EOF
}
check_cxxflags() {
- log check_cxxflags "$@"
+ log check_cxxflags "$@"
- # Catch CFLAGS that trigger CXX warnings
- case "$CXX" in
- *c++-analyzer|*clang++|*g++*) check_cxx -Werror "$@" <<EOF
+ # Catch CFLAGS that trigger CXX warnings
+ case "$CXX" in
+ *c++-analyzer|*clang++|*g++*)
+ check_cxx -Werror "$@" <<EOF
int x;
EOF
;;
- *) check_cxx -Werror "$@" <<EOF
+ *)
+ check_cxx -Werror "$@" <<EOF
int x;
EOF
;;
@@ -350,82 +333,82 @@ EOF
}
check_add_cflags() {
- check_cxxflags "$@" && add_cxxflags_only "$@"
- check_cflags "$@" && add_cflags_only "$@"
+ check_cxxflags "$@" && add_cxxflags_only "$@"
+ check_cflags "$@" && add_cflags_only "$@"
}
check_add_asflags() {
- log add_asflags "$@"
- add_asflags "$@"
+ log add_asflags "$@"
+ add_asflags "$@"
}
check_add_ldflags() {
- log add_ldflags "$@"
- add_ldflags "$@"
+ log add_ldflags "$@"
+ add_ldflags "$@"
}
check_asm_align() {
- log check_asm_align "$@"
- cat >${TMP_ASM} <<EOF
+ log check_asm_align "$@"
+ cat >${TMP_ASM} <<EOF
section .rodata
align 16
EOF
- log_file ${TMP_ASM}
- check_cmd ${AS} ${ASFLAGS} -o ${TMP_O} ${TMP_ASM}
- readelf -WS ${TMP_O} >${TMP_X}
- log_file ${TMP_X}
- if ! grep -q '\.rodata .* 16$' ${TMP_X}; then
- die "${AS} ${ASFLAGS} does not support section alignment (nasm <=2.08?)"
- fi
+ log_file ${TMP_ASM}
+ check_cmd ${AS} ${ASFLAGS} -o ${TMP_O} ${TMP_ASM}
+ readelf -WS ${TMP_O} >${TMP_X}
+ log_file ${TMP_X}
+ if ! grep -q '\.rodata .* 16$' ${TMP_X}; then
+ die "${AS} ${ASFLAGS} does not support section alignment (nasm <=2.08?)"
+ fi
}
# tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used.
check_gcc_machine_option() {
- opt="$1"
- feature="$2"
- [ -n "$feature" ] || feature="$opt"
-
- if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
- RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "
- else
- soft_enable "$feature"
- fi
+ opt="$1"
+ feature="$2"
+ [ -n "$feature" ] || feature="$opt"
+
+ if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "
+ else
+ soft_enable "$feature"
+ fi
}
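Each probe thus either records the feature as disabled in RTCD_OPTIONS or soft-enables it. Illustrative calls (the real ones appear later in this file):

    check_gcc_machine_option mmx          # probes -mmmx, toggles "mmx"
    check_gcc_machine_option sse4 sse4_1  # probes -msse4, toggles "sse4_1"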
write_common_config_banner() {
- print_webm_license config.mk "##" ""
- echo '# This file automatically generated by configure. Do not edit!' >> config.mk
- echo "TOOLCHAIN := ${toolchain}" >> config.mk
+ print_webm_license config.mk "##" ""
+ echo '# This file automatically generated by configure. Do not edit!' >> config.mk
+ echo "TOOLCHAIN := ${toolchain}" >> config.mk
- case ${toolchain} in
- *-linux-rvct)
- echo "ALT_LIBC := ${alt_libc}" >> config.mk
- ;;
- esac
+ case ${toolchain} in
+ *-linux-rvct)
+ echo "ALT_LIBC := ${alt_libc}" >> config.mk
+ ;;
+ esac
}
write_common_config_targets() {
- for t in ${all_targets}; do
- if enabled ${t}; then
- if enabled universal || enabled child; then
- fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}"
- else
- fwrite config.mk "ALL_TARGETS += ${t}"
- fi
- fi
+ for t in ${all_targets}; do
+ if enabled ${t}; then
+ if enabled universal || enabled child; then
+ fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}"
+ else
+ fwrite config.mk "ALL_TARGETS += ${t}"
+ fi
+ fi
true;
- done
-true
+ done
+ true
}
write_common_target_config_mk() {
- saved_CC="${CC}"
- saved_CXX="${CXX}"
- enabled ccache && CC="ccache ${CC}"
- enabled ccache && CXX="ccache ${CXX}"
- print_webm_license $1 "##" ""
+ saved_CC="${CC}"
+ saved_CXX="${CXX}"
+ enabled ccache && CC="ccache ${CC}"
+ enabled ccache && CXX="ccache ${CXX}"
+ print_webm_license $1 "##" ""
- cat >> $1 << EOF
+ cat >> $1 << EOF
# This file automatically generated by configure. Do not edit!
SRC_PATH="$source_path"
SRC_PATH_BARE=$source_path
@@ -455,83 +438,87 @@ VCPROJ_SFX = ${VCPROJ_SFX}
RTCD_OPTIONS = ${RTCD_OPTIONS}
EOF
- if enabled rvct; then cat >> $1 << EOF
+ if enabled rvct; then cat >> $1 << EOF
fmt_deps = sed -e 's;^__image.axf;\${@:.d=.o} \$@;' #hide
EOF
- else cat >> $1 << EOF
+ else cat >> $1 << EOF
fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\${@:.d=.o} \$@;'
EOF
- fi
+ fi
- print_config_mk ARCH "${1}" ${ARCH_LIST}
- print_config_mk HAVE "${1}" ${HAVE_LIST}
- print_config_mk CONFIG "${1}" ${CONFIG_LIST}
- print_config_mk HAVE "${1}" gnu_strip
+ print_config_mk ARCH "${1}" ${ARCH_LIST}
+ print_config_mk HAVE "${1}" ${HAVE_LIST}
+ print_config_mk CONFIG "${1}" ${CONFIG_LIST}
+ print_config_mk HAVE "${1}" gnu_strip
- enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
+ enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
- CC="${saved_CC}"
- CXX="${saved_CXX}"
+ CC="${saved_CC}"
+ CXX="${saved_CXX}"
}
-
write_common_target_config_h() {
- print_webm_license ${TMP_H} "/*" " */"
- cat >> ${TMP_H} << EOF
+ print_webm_license ${TMP_H} "/*" " */"
+ cat >> ${TMP_H} << EOF
/* This file automatically generated by configure. Do not edit! */
#ifndef VPX_CONFIG_H
#define VPX_CONFIG_H
#define RESTRICT ${RESTRICT}
#define INLINE ${INLINE}
EOF
- print_config_h ARCH "${TMP_H}" ${ARCH_LIST}
- print_config_h HAVE "${TMP_H}" ${HAVE_LIST}
- print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
- print_config_vars_h "${TMP_H}" ${VAR_LIST}
- echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H}
- mkdir -p `dirname "$1"`
- cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
+ print_config_h ARCH "${TMP_H}" ${ARCH_LIST}
+ print_config_h HAVE "${TMP_H}" ${HAVE_LIST}
+ print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
+ print_config_vars_h "${TMP_H}" ${VAR_LIST}
+ echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H}
+ mkdir -p `dirname "$1"`
+ cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
}
process_common_cmdline() {
- for opt in "$@"; do
- optval="${opt#*=}"
- case "$opt" in
- --child) enable_feature child
+ for opt in "$@"; do
+ optval="${opt#*=}"
+ case "$opt" in
+ --child)
+ enable_feature child
;;
- --log*)
+ --log*)
logging="$optval"
if ! disabled logging ; then
- enabled logging || logfile="$logging"
+ enabled logging || logfile="$logging"
else
- logfile=/dev/null
+ logfile=/dev/null
fi
;;
- --target=*) toolchain="${toolchain:-${optval}}"
- ;;
- --force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain
+ --target=*)
+ toolchain="${toolchain:-${optval}}"
;;
- --cpu)
+ --force-target=*)
+ toolchain="${toolchain:-${optval}}"
+ enable_feature force_toolchain
;;
- --cpu=*) tune_cpu="$optval"
+ --cpu=*)
+ tune_cpu="$optval"
;;
- --extra-cflags=*)
+ --extra-cflags=*)
extra_cflags="${optval}"
;;
- --enable-?*|--disable-?*)
+ --enable-?*|--disable-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
- [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
+ [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
elif [ $action = "disable" ] && ! disabled $option ; then
echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
die_unknown $opt
+ log_echo " disabling $option"
elif [ $action = "enable" ] && ! enabled $option ; then
echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
die_unknown $opt
+ log_echo " enabling $option"
fi
${action}_feature $option
;;
- --require-?*)
+ --require-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
if echo "${ARCH_EXT_LIST}" none | grep "^ *$option\$" >/dev/null; then
RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
@@ -539,22 +526,22 @@ process_common_cmdline() {
die_unknown $opt
fi
;;
- --force-enable-?*|--force-disable-?*)
+ --force-enable-?*|--force-disable-?*)
eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
${action}_feature $option
;;
- --libc=*)
+ --libc=*)
[ -d "${optval}" ] || die "Not a directory: ${optval}"
disable_feature builtin_libc
alt_libc="${optval}"
;;
- --as=*)
+ --as=*)
[ "${optval}" = yasm ] || [ "${optval}" = nasm ] \
- || [ "${optval}" = auto ] \
- || die "Must be yasm, nasm or auto: ${optval}"
+ || [ "${optval}" = auto ] \
+ || die "Must be yasm, nasm or auto: ${optval}"
alt_as="${optval}"
;;
- --size-limit=*)
+ --size-limit=*)
w="${optval%%x*}"
h="${optval##*x}"
VAR_LIST="DECODE_WIDTH_LIMIT ${w} DECODE_HEIGHT_LIMIT ${h}"
@@ -563,825 +550,832 @@ process_common_cmdline() {
|| die "Invalid size-limit: too big."
enable_feature size_limit
;;
- --prefix=*)
+ --prefix=*)
prefix="${optval}"
;;
- --libdir=*)
+ --libdir=*)
libdir="${optval}"
;;
- --sdk-path=*)
+ --sdk-path=*)
[ -d "${optval}" ] || die "Not a directory: ${optval}"
sdk_path="${optval}"
;;
- --libc|--as|--prefix|--libdir|--sdk-path)
+ --libc|--as|--prefix|--libdir|--sdk-path)
die "Option ${opt} requires argument"
;;
- --help|-h) show_help
+ --help|-h)
+ show_help
;;
- *) die_unknown $opt
+ *)
+ die_unknown $opt
;;
- esac
- done
+ esac
+ done
}
process_cmdline() {
- for opt do
- optval="${opt#*=}"
- case "$opt" in
- *) process_common_cmdline $opt
+ for opt do
+ optval="${opt#*=}"
+ case "$opt" in
+ *)
+ process_common_cmdline $opt
;;
- esac
- done
+ esac
+ done
}
-
post_process_common_cmdline() {
- prefix="${prefix:-/usr/local}"
- prefix="${prefix%/}"
- libdir="${libdir:-${prefix}/lib}"
- libdir="${libdir%/}"
- if [ "${libdir#${prefix}}" = "${libdir}" ]; then
- die "Libdir ${libdir} must be a subdirectory of ${prefix}"
- fi
+ prefix="${prefix:-/usr/local}"
+ prefix="${prefix%/}"
+ libdir="${libdir:-${prefix}/lib}"
+ libdir="${libdir%/}"
+ if [ "${libdir#${prefix}}" = "${libdir}" ]; then
+ die "Libdir ${libdir} must be a subdirectory of ${prefix}"
+ fi
}
-
post_process_cmdline() {
- true;
+ true;
}
setup_gnu_toolchain() {
- CC=${CC:-${CROSS}gcc}
- CXX=${CXX:-${CROSS}g++}
- AR=${AR:-${CROSS}ar}
- LD=${LD:-${CROSS}${link_with_cc:-ld}}
- AS=${AS:-${CROSS}as}
- STRIP=${STRIP:-${CROSS}strip}
- NM=${NM:-${CROSS}nm}
- AS_SFX=.s
- EXE_SFX=
+ CC=${CC:-${CROSS}gcc}
+ CXX=${CXX:-${CROSS}g++}
+ AR=${AR:-${CROSS}ar}
+ LD=${LD:-${CROSS}${link_with_cc:-ld}}
+ AS=${AS:-${CROSS}as}
+ STRIP=${STRIP:-${CROSS}strip}
+ NM=${NM:-${CROSS}nm}
+ AS_SFX=.s
+ EXE_SFX=
+}
+
+# Reliably find the newest available Darwin SDKs. (Older versions of
+# xcrun don't support --show-sdk-path.)
+show_darwin_sdk_path() {
+ xcrun --sdk $1 --show-sdk-path 2>/dev/null ||
+ xcodebuild -sdk $1 -version Path 2>/dev/null
}
process_common_toolchain() {
- if [ -z "$toolchain" ]; then
- gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
-
- # detect tgt_isa
- case "$gcctarget" in
- armv6*)
- tgt_isa=armv6
- ;;
- armv7*-hardfloat*)
- tgt_isa=armv7
- float_abi=hard
- ;;
- armv7*)
- tgt_isa=armv7
- float_abi=softfp
- ;;
- armv5te*)
- tgt_isa=armv5te
- ;;
- *x86_64*|*amd64*)
- tgt_isa=x86_64
- ;;
- *i[3456]86*)
- tgt_isa=x86
- ;;
- *powerpc64*)
- tgt_isa=ppc64
- ;;
- *powerpc*)
- tgt_isa=ppc32
- ;;
- *sparc*)
- tgt_isa=sparc
- ;;
- esac
+ if [ -z "$toolchain" ]; then
+ gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
- # detect tgt_os
- case "$gcctarget" in
- *darwin8*)
- tgt_isa=universal
- tgt_os=darwin8
- ;;
- *darwin9*)
- tgt_isa=universal
- tgt_os=darwin9
- ;;
- *darwin10*)
- tgt_isa=x86_64
- tgt_os=darwin10
- ;;
- *darwin11*)
- tgt_isa=x86_64
- tgt_os=darwin11
- ;;
- *darwin12*)
- tgt_isa=x86_64
- tgt_os=darwin12
- ;;
- *darwin13*)
- tgt_isa=x86_64
- tgt_os=darwin13
- ;;
- x86_64*mingw32*)
- tgt_os=win64
- ;;
- *mingw32*|*cygwin*)
- [ -z "$tgt_isa" ] && tgt_isa=x86
- tgt_os=win32
- ;;
- *linux*|*bsd*)
- tgt_os=linux
- ;;
- *solaris2.10)
- tgt_os=solaris
- ;;
- *os2*)
- tgt_os=os2
- ;;
- esac
+ # detect tgt_isa
+ case "$gcctarget" in
+ armv6*)
+ tgt_isa=armv6
+ ;;
+ armv7*-hardfloat*)
+ tgt_isa=armv7
+ float_abi=hard
+ ;;
+ armv7*)
+ tgt_isa=armv7
+ float_abi=softfp
+ ;;
+ *x86_64*|*amd64*)
+ tgt_isa=x86_64
+ ;;
+ *i[3456]86*)
+ tgt_isa=x86
+ ;;
+ *sparc*)
+ tgt_isa=sparc
+ ;;
+ esac
- if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
- toolchain=${tgt_isa}-${tgt_os}-gcc
- fi
- fi
+ # detect tgt_os
+ case "$gcctarget" in
+ *darwin8*)
+ tgt_isa=universal
+ tgt_os=darwin8
+ ;;
+ *darwin9*)
+ tgt_isa=universal
+ tgt_os=darwin9
+ ;;
+ *darwin10*)
+ tgt_isa=x86_64
+ tgt_os=darwin10
+ ;;
+ *darwin11*)
+ tgt_isa=x86_64
+ tgt_os=darwin11
+ ;;
+ *darwin12*)
+ tgt_isa=x86_64
+ tgt_os=darwin12
+ ;;
+ *darwin13*)
+ tgt_isa=x86_64
+ tgt_os=darwin13
+ ;;
+ *darwin14*)
+ tgt_isa=x86_64
+ tgt_os=darwin14
+ ;;
+ x86_64*mingw32*)
+ tgt_os=win64
+ ;;
+ *mingw32*|*cygwin*)
+ [ -z "$tgt_isa" ] && tgt_isa=x86
+ tgt_os=win32
+ ;;
+ *linux*|*bsd*)
+ tgt_os=linux
+ ;;
+ *solaris2.10)
+ tgt_os=solaris
+ ;;
+ *os2*)
+ tgt_os=os2
+ ;;
+ esac
- toolchain=${toolchain:-generic-gnu}
+ if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
+ toolchain=${tgt_isa}-${tgt_os}-gcc
+ fi
+ fi
- is_in ${toolchain} ${all_platforms} || enabled force_toolchain \
- || die "Unrecognized toolchain '${toolchain}'"
+ toolchain=${toolchain:-generic-gnu}
- enabled child || log_echo "Configuring for target '${toolchain}'"
+ is_in ${toolchain} ${all_platforms} || enabled force_toolchain \
+ || die "Unrecognized toolchain '${toolchain}'"
- #
- # Set up toolchain variables
- #
- tgt_isa=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $1}')
- tgt_os=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $2}')
- tgt_cc=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $3}')
+ enabled child || log_echo "Configuring for target '${toolchain}'"
- # Mark the specific ISA requested as enabled
- soft_enable ${tgt_isa}
- enable_feature ${tgt_os}
- enable_feature ${tgt_cc}
+ #
+ # Set up toolchain variables
+ #
+ tgt_isa=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $1}')
+ tgt_os=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $2}')
+ tgt_cc=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $3}')
- # Enable the architecture family
- case ${tgt_isa} in
- arm*) enable_feature arm;;
- mips*) enable_feature mips;;
- esac
+ # Mark the specific ISA requested as enabled
+ soft_enable ${tgt_isa}
+ enable_feature ${tgt_os}
+ enable_feature ${tgt_cc}
- # PIC is probably what we want when building shared libs
- enabled shared && soft_enable pic
+ # Enable the architecture family
+ case ${tgt_isa} in
+ arm*)
+ enable_feature arm
+ ;;
+ mips*)
+ enable_feature mips
+ ;;
+ esac
- # Minimum iOS version for all target platforms (darwin and iphonesimulator).
- IOS_VERSION_MIN="6.0"
+ # PIC is probably what we want when building shared libs
+ enabled shared && soft_enable pic
- # Handle darwin variants. Newer SDKs allow targeting older
- # platforms, so find the newest SDK available.
- case ${toolchain} in
- *-darwin*)
- if [ -z "${DEVELOPER_DIR}" ]; then
- DEVELOPER_DIR=`xcode-select -print-path 2> /dev/null`
- [ $? -ne 0 ] && OSX_SKIP_DIR_CHECK=1
- fi
- if [ -z "${OSX_SKIP_DIR_CHECK}" ]; then
- OSX_SDK_ROOTS="${DEVELOPER_DIR}/SDKs"
- OSX_SDK_VERSIONS="MacOSX10.4u.sdk MacOSX10.5.sdk MacOSX10.6.sdk"
- OSX_SDK_VERSIONS="${OSX_SDK_VERSIONS} MacOSX10.7.sdk"
- for v in ${OSX_SDK_VERSIONS}; do
- if [ -d "${OSX_SDK_ROOTS}/${v}" ]; then
- osx_sdk_dir="${OSX_SDK_ROOTS}/${v}"
- fi
- done
- fi
- ;;
- esac
+ # Minimum iOS version for all target platforms (darwin and iphonesimulator).
+ IOS_VERSION_MIN="6.0"
- if [ -d "${osx_sdk_dir}" ]; then
+ # Handle darwin variants. Newer SDKs allow targeting older
+ # platforms, so use the newest one available.
+ case ${toolchain} in
+ *-darwin*)
+ osx_sdk_dir="$(show_darwin_sdk_path macosx)"
+ if [ -d "${osx_sdk_dir}" ]; then
add_cflags "-isysroot ${osx_sdk_dir}"
add_ldflags "-isysroot ${osx_sdk_dir}"
- fi
+ fi
+ ;;
+ esac
- case ${toolchain} in
- *-darwin8-*)
- add_cflags "-mmacosx-version-min=10.4"
- add_ldflags "-mmacosx-version-min=10.4"
- ;;
- *-darwin9-*)
- add_cflags "-mmacosx-version-min=10.5"
- add_ldflags "-mmacosx-version-min=10.5"
- ;;
- *-darwin10-*)
- add_cflags "-mmacosx-version-min=10.6"
- add_ldflags "-mmacosx-version-min=10.6"
- ;;
- *-darwin11-*)
- add_cflags "-mmacosx-version-min=10.7"
- add_ldflags "-mmacosx-version-min=10.7"
- ;;
- *-darwin12-*)
- add_cflags "-mmacosx-version-min=10.8"
- add_ldflags "-mmacosx-version-min=10.8"
- ;;
- *-darwin13-*)
- add_cflags "-mmacosx-version-min=10.9"
- add_ldflags "-mmacosx-version-min=10.9"
- ;;
- *-iphonesimulator-*)
- add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
- add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
- osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
- add_cflags "-isysroot ${osx_sdk_dir}"
- add_ldflags "-isysroot ${osx_sdk_dir}"
- ;;
- esac
+ case ${toolchain} in
+ *-darwin8-*)
+ add_cflags "-mmacosx-version-min=10.4"
+ add_ldflags "-mmacosx-version-min=10.4"
+ ;;
+ *-darwin9-*)
+ add_cflags "-mmacosx-version-min=10.5"
+ add_ldflags "-mmacosx-version-min=10.5"
+ ;;
+ *-darwin10-*)
+ add_cflags "-mmacosx-version-min=10.6"
+ add_ldflags "-mmacosx-version-min=10.6"
+ ;;
+ *-darwin11-*)
+ add_cflags "-mmacosx-version-min=10.7"
+ add_ldflags "-mmacosx-version-min=10.7"
+ ;;
+ *-darwin12-*)
+ add_cflags "-mmacosx-version-min=10.8"
+ add_ldflags "-mmacosx-version-min=10.8"
+ ;;
+ *-darwin13-*)
+ add_cflags "-mmacosx-version-min=10.9"
+ add_ldflags "-mmacosx-version-min=10.9"
+ ;;
+ *-darwin14-*)
+ add_cflags "-mmacosx-version-min=10.10"
+ add_ldflags "-mmacosx-version-min=10.10"
+ ;;
+ *-iphonesimulator-*)
+ add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
+ add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
+ iossim_sdk_dir="$(show_darwin_sdk_path iphonesimulator)"
+ if [ -d "${iossim_sdk_dir}" ]; then
+ add_cflags "-isysroot ${iossim_sdk_dir}"
+ add_ldflags "-isysroot ${iossim_sdk_dir}"
+ fi
+ ;;
+ esac
- # Handle Solaris variants. Solaris 10 needs -lposix4
- case ${toolchain} in
- sparc-solaris-*)
- add_extralibs -lposix4
- disable_feature fast_unaligned
- ;;
- *-solaris-*)
- add_extralibs -lposix4
- ;;
- esac
+ # Handle Solaris variants. Solaris 10 needs -lposix4
+ case ${toolchain} in
+ sparc-solaris-*)
+ add_extralibs -lposix4
+ ;;
+ *-solaris-*)
+ add_extralibs -lposix4
+ ;;
+ esac
- # Process ARM architecture variants
- case ${toolchain} in
+ # Process ARM architecture variants
+ case ${toolchain} in
arm*)
- # on arm, isa versions are supersets
- case ${tgt_isa} in
+ # on arm, isa versions are supersets
+ case ${tgt_isa} in
arm64|armv8)
- soft_enable neon
- ;;
+ soft_enable neon
+ ;;
armv7|armv7s)
- soft_enable neon
- soft_enable neon_asm
- soft_enable media
- soft_enable edsp
- soft_enable fast_unaligned
- ;;
+ soft_enable neon
+ # Only enable neon_asm when neon is also enabled.
+ enabled neon && soft_enable neon_asm
+ # If someone tries to force it through, die.
+ if disabled neon && enabled neon_asm; then
+ die "Disabling neon while keeping neon-asm is not supported"
+ fi
+ soft_enable media
+ ;;
armv6)
- soft_enable media
- soft_enable edsp
- soft_enable fast_unaligned
- ;;
- armv5te)
- soft_enable edsp
- disable_feature fast_unaligned
- ;;
- esac
+ soft_enable media
+ ;;
+ esac
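With this coupling, an explicit mismatch now fails at configure time instead of producing a broken build. For example (the target name here is illustrative; option spellings follow the usual underscore-to-hyphen mapping):

    ./configure --target=armv7-linux-gcc --disable-neon --enable-neon-asm
    # dies: "Disabling neon while keeping neon-asm is not supported"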
- asm_conversion_cmd="cat"
+ asm_conversion_cmd="cat"
- case ${tgt_cc} in
+ case ${tgt_cc} in
gcc)
- CROSS=${CROSS:-arm-none-linux-gnueabi-}
- link_with_cc=gcc
- setup_gnu_toolchain
- arch_int=${tgt_isa##armv}
- arch_int=${arch_int%%te}
- check_add_asflags --defsym ARCHITECTURE=${arch_int}
- tune_cflags="-mtune="
- if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
- if [ -z "${float_abi}" ]; then
- check_cpp <<EOF && float_abi=hard || float_abi=softfp
+ CROSS=${CROSS:-arm-none-linux-gnueabi-}
+ link_with_cc=gcc
+ setup_gnu_toolchain
+ arch_int=${tgt_isa##armv}
+ arch_int=${arch_int%%te}
+ check_add_asflags --defsym ARCHITECTURE=${arch_int}
+ tune_cflags="-mtune="
+ if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
+ if [ -z "${float_abi}" ]; then
+ check_cpp <<EOF && float_abi=hard || float_abi=softfp
#ifndef __ARM_PCS_VFP
#error "not hardfp"
#endif
EOF
- fi
- check_add_cflags -march=armv7-a -mfloat-abi=${float_abi}
- check_add_asflags -march=armv7-a -mfloat-abi=${float_abi}
-
- if enabled neon || enabled neon_asm
- then
- check_add_cflags -mfpu=neon #-ftree-vectorize
- check_add_asflags -mfpu=neon
- fi
-
- if [ -z "${tune_cpu}" ]; then
- tune_cpu=cortex-a8
- fi
- else
- check_add_cflags -march=${tgt_isa}
- check_add_asflags -march=${tgt_isa}
fi
+ check_add_cflags -march=armv7-a -mfloat-abi=${float_abi}
+ check_add_asflags -march=armv7-a -mfloat-abi=${float_abi}
- enabled debug && add_asflags -g
- asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
- if enabled thumb; then
- asm_conversion_cmd="$asm_conversion_cmd -thumb"
- check_add_cflags -mthumb
- check_add_asflags -mthumb -mimplicit-it=always
+ if enabled neon || enabled neon_asm; then
+ check_add_cflags -mfpu=neon #-ftree-vectorize
+ check_add_asflags -mfpu=neon
fi
- ;;
+ else
+ check_add_cflags -march=${tgt_isa}
+ check_add_asflags -march=${tgt_isa}
+ fi
+
+ enabled debug && add_asflags -g
+ asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
+ if enabled thumb; then
+ asm_conversion_cmd="$asm_conversion_cmd -thumb"
+ check_add_cflags -mthumb
+ check_add_asflags -mthumb -mimplicit-it=always
+ fi
+ ;;
vs*)
- asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
- AS_SFX=.s
- msvs_arch_dir=arm-msvs
- disable_feature multithread
- disable_feature unit_tests
- vs_version=${tgt_cc##vs}
- if [ $vs_version -ge 12 ]; then
- # MSVC 2013 doesn't allow doing plain .exe projects for ARM,
- # only "AppContainerApplication" which requires an AppxManifest.
- # Therefore disable the examples, just build the library.
- disable_feature examples
- fi
- ;;
+ asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
+ AS_SFX=.s
+ msvs_arch_dir=arm-msvs
+ disable_feature multithread
+ disable_feature unit_tests
+ vs_version=${tgt_cc##vs}
+ if [ $vs_version -ge 12 ]; then
+ # MSVC 2013 doesn't allow doing plain .exe projects for ARM,
+ # only "AppContainerApplication" which requires an AppxManifest.
+ # Therefore disable the examples, just build the library.
+ disable_feature examples
+ fi
+ ;;
rvct)
- CC=armcc
- AR=armar
- AS=armasm
- LD="${source_path}/build/make/armlink_adapter.sh"
- STRIP=arm-none-linux-gnueabi-strip
- NM=arm-none-linux-gnueabi-nm
- tune_cflags="--cpu="
- tune_asflags="--cpu="
- if [ -z "${tune_cpu}" ]; then
- if [ ${tgt_isa} = "armv7" ]; then
- if enabled neon || enabled neon_asm
- then
- check_add_cflags --fpu=softvfp+vfpv3
- check_add_asflags --fpu=softvfp+vfpv3
- fi
- check_add_cflags --cpu=Cortex-A8
- check_add_asflags --cpu=Cortex-A8
- else
- check_add_cflags --cpu=${tgt_isa##armv}
- check_add_asflags --cpu=${tgt_isa##armv}
- fi
+ CC=armcc
+ AR=armar
+ AS=armasm
+ LD="${source_path}/build/make/armlink_adapter.sh"
+ STRIP=arm-none-linux-gnueabi-strip
+ NM=arm-none-linux-gnueabi-nm
+ tune_cflags="--cpu="
+ tune_asflags="--cpu="
+ if [ -z "${tune_cpu}" ]; then
+ if [ ${tgt_isa} = "armv7" ]; then
+ if enabled neon || enabled neon_asm
+ then
+ check_add_cflags --fpu=softvfp+vfpv3
+ check_add_asflags --fpu=softvfp+vfpv3
+ fi
+ check_add_cflags --cpu=Cortex-A8
+ check_add_asflags --cpu=Cortex-A8
+ else
+ check_add_cflags --cpu=${tgt_isa##armv}
+ check_add_asflags --cpu=${tgt_isa##armv}
fi
- arch_int=${tgt_isa##armv}
- arch_int=${arch_int%%te}
- check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
- enabled debug && add_asflags -g
- add_cflags --gnu
- add_cflags --enum_is_int
- add_cflags --wchar32
- ;;
- esac
+ fi
+ arch_int=${tgt_isa##armv}
+ arch_int=${arch_int%%te}
+ check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
+ enabled debug && add_asflags -g
+ add_cflags --gnu
+ add_cflags --enum_is_int
+ add_cflags --wchar32
+ ;;
+ esac
- case ${tgt_os} in
+ case ${tgt_os} in
none*)
- disable_feature multithread
- disable_feature os_support
- ;;
+ disable_feature multithread
+ disable_feature os_support
+ ;;
android*)
- SDK_PATH=${sdk_path}
- COMPILER_LOCATION=`find "${SDK_PATH}" \
- -name "arm-linux-androideabi-gcc*" -print -quit`
- TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
- CC=${TOOLCHAIN_PATH}gcc
- CXX=${TOOLCHAIN_PATH}g++
- AR=${TOOLCHAIN_PATH}ar
- LD=${TOOLCHAIN_PATH}gcc
- AS=${TOOLCHAIN_PATH}as
- STRIP=${TOOLCHAIN_PATH}strip
- NM=${TOOLCHAIN_PATH}nm
-
- if [ -z "${alt_libc}" ]; then
- alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
- awk '{n = split($0,a,"/"); \
- split(a[n-1],b,"-"); \
- print $0 " " b[2]}' | \
- sort -g -k 2 | \
- awk '{ print $1 }' | tail -1`
- fi
-
- add_cflags "--sysroot=${alt_libc}"
- add_ldflags "--sysroot=${alt_libc}"
-
- # linker flag that routes around a CPU bug in some
- # Cortex-A8 implementations (NDK Dev Guide)
- add_ldflags "-Wl,--fix-cortex-a8"
-
- enable_feature pic
- soft_enable realtime_only
- if [ ${tgt_isa} = "armv7" ]; then
- soft_enable runtime_cpu_detect
- fi
- if enabled runtime_cpu_detect; then
- add_cflags "-I${SDK_PATH}/sources/android/cpufeatures"
- fi
+ SDK_PATH=${sdk_path}
+ COMPILER_LOCATION=`find "${SDK_PATH}" \
+ -name "arm-linux-androideabi-gcc*" -print -quit`
+ TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
+ CC=${TOOLCHAIN_PATH}gcc
+ CXX=${TOOLCHAIN_PATH}g++
+ AR=${TOOLCHAIN_PATH}ar
+ LD=${TOOLCHAIN_PATH}gcc
+ AS=${TOOLCHAIN_PATH}as
+ STRIP=${TOOLCHAIN_PATH}strip
+ NM=${TOOLCHAIN_PATH}nm
+
+ if [ -z "${alt_libc}" ]; then
+ alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
+ awk '{n = split($0,a,"/"); \
+ split(a[n-1],b,"-"); \
+ print $0 " " b[2]}' | \
+ sort -g -k 2 | \
+ awk '{ print $1 }' | tail -1`
+ fi
+
+ add_cflags "--sysroot=${alt_libc}"
+ add_ldflags "--sysroot=${alt_libc}"
+
+ # linker flag that routes around a CPU bug in some
+ # Cortex-A8 implementations (NDK Dev Guide)
+ add_ldflags "-Wl,--fix-cortex-a8"
+
+ enable_feature pic
+ soft_enable realtime_only
+ if [ ${tgt_isa} = "armv7" ]; then
+ soft_enable runtime_cpu_detect
+ fi
+ if enabled runtime_cpu_detect; then
+ add_cflags "-I${SDK_PATH}/sources/android/cpufeatures"
+ fi
;;
darwin*)
- XCRUN_FIND="xcrun --sdk iphoneos -find"
- CXX="$(${XCRUN_FIND} clang++)"
- CC="$(${XCRUN_FIND} clang)"
- AR="$(${XCRUN_FIND} ar)"
- AS="$(${XCRUN_FIND} as)"
- STRIP="$(${XCRUN_FIND} strip)"
- NM="$(${XCRUN_FIND} nm)"
- RANLIB="$(${XCRUN_FIND} ranlib)"
- AS_SFX=.s
-
- # Special handling of ld for armv6 because libclang_rt.ios.a does
- # not contain armv6 support in Apple's clang package:
- # Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
- # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
- # renders support for armv6 unnecessary because the 3GS and up
- # support neon.
- if [ "${tgt_isa}" = "armv6" ]; then
- LD="$(${XCRUN_FIND} ld)"
- else
- LD="${CXX:-$(${XCRUN_FIND} ld)}"
- fi
-
- # ASFLAGS is written here instead of using check_add_asflags
- # because we need to overwrite all of ASFLAGS and purge the
- # options that were put in above
- ASFLAGS="-arch ${tgt_isa} -g"
-
- alt_libc="$(xcrun --sdk iphoneos --show-sdk-path)"
- add_cflags -arch ${tgt_isa} -isysroot ${alt_libc}
- add_ldflags -arch ${tgt_isa}
-
- if [ "${LD}" = "${CXX}" ]; then
- add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
- else
- add_ldflags -ios_version_min "${IOS_VERSION_MIN}"
- fi
-
- for d in lib usr/lib usr/lib/system; do
- try_dir="${alt_libc}/${d}"
- [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
- done
-
- asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
- ;;
+ XCRUN_FIND="xcrun --sdk iphoneos --find"
+ CXX="$(${XCRUN_FIND} clang++)"
+ CC="$(${XCRUN_FIND} clang)"
+ AR="$(${XCRUN_FIND} ar)"
+ AS="$(${XCRUN_FIND} as)"
+ STRIP="$(${XCRUN_FIND} strip)"
+ NM="$(${XCRUN_FIND} nm)"
+ RANLIB="$(${XCRUN_FIND} ranlib)"
+ AS_SFX=.s
+
+ # Special handling of ld for armv6 because libclang_rt.ios.a does
+ # not contain armv6 support in Apple's clang package:
+ # Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
+ # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
+ # renders support for armv6 unnecessary because the 3GS and up
+ # support neon.
+ if [ "${tgt_isa}" = "armv6" ]; then
+ LD="$(${XCRUN_FIND} ld)"
+ else
+ LD="${CXX:-$(${XCRUN_FIND} ld)}"
+ fi
+
+ # ASFLAGS is written here instead of using check_add_asflags
+ # because we need to overwrite all of ASFLAGS and purge the
+ # options that were put in above
+ ASFLAGS="-arch ${tgt_isa} -g"
+
+ add_cflags -arch ${tgt_isa}
+ add_ldflags -arch ${tgt_isa}
+
+ alt_libc="$(show_darwin_sdk_path iphoneos)"
+ if [ -d "${alt_libc}" ]; then
+ add_cflags -isysroot ${alt_libc}
+ fi
+
+ if [ "${LD}" = "${CXX}" ]; then
+ add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
+ else
+ add_ldflags -ios_version_min "${IOS_VERSION_MIN}"
+ fi
+
+ for d in lib usr/lib usr/lib/system; do
+ try_dir="${alt_libc}/${d}"
+ [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
+ done
+
+ asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
+ ;;
linux*)
- enable_feature linux
- if enabled rvct; then
- # Check if we have CodeSourcery GCC in PATH. Needed for
- # libraries
- hash arm-none-linux-gnueabi-gcc 2>&- || \
- die "Couldn't find CodeSourcery GCC from PATH"
-
- # Use armcc as a linker to enable translation of
- # some gcc specific options such as -lm and -lpthread.
- LD="armcc --translate_gcc"
-
- # create configuration file (uses path to CodeSourcery GCC)
- armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg
-
- add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
- add_asflags --no_hide_all --apcs=/interwork
- add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
- enabled pic && add_cflags --apcs=/fpic
- enabled pic && add_asflags --apcs=/fpic
- enabled shared && add_cflags --shared
- fi
- ;;
-
- esac
- ;;
+ enable_feature linux
+ if enabled rvct; then
+ # Check if we have CodeSourcery GCC in PATH. Needed for
+ # libraries
+ hash arm-none-linux-gnueabi-gcc 2>&- || \
+ die "Couldn't find CodeSourcery GCC from PATH"
+
+ # Use armcc as a linker to enable translation of
+ # some gcc specific options such as -lm and -lpthread.
+ LD="armcc --translate_gcc"
+
+ # create configuration file (uses path to CodeSourcery GCC)
+ armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg
+
+ add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
+ add_asflags --no_hide_all --apcs=/interwork
+ add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
+ enabled pic && add_cflags --apcs=/fpic
+ enabled pic && add_asflags --apcs=/fpic
+ enabled shared && add_cflags --shared
+ fi
+ ;;
+ esac
+ ;;
mips*)
- link_with_cc=gcc
- setup_gnu_toolchain
- tune_cflags="-mtune="
- if enabled dspr2; then
- check_add_cflags -mips32r2 -mdspr2
- disable_feature fast_unaligned
- fi
- check_add_cflags -march=${tgt_isa}
- check_add_asflags -march=${tgt_isa}
- check_add_asflags -KPIC
- ;;
- ppc*)
- enable_feature ppc
- bits=${tgt_isa##ppc}
- link_with_cc=gcc
- setup_gnu_toolchain
- add_asflags -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
- soft_enable altivec
- enabled altivec && add_cflags -maltivec
-
- case "$tgt_os" in
- linux*)
- add_asflags -maltivec -mregnames -I"\$(dir \$<)linux"
- ;;
- darwin*)
- darwin_arch="-arch ppc"
- enabled ppc64 && darwin_arch="${darwin_arch}64"
- add_cflags ${darwin_arch} -m${bits} -fasm-blocks
- add_asflags ${darwin_arch} -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
- add_ldflags ${darwin_arch} -m${bits}
- enabled altivec && add_cflags -faltivec
- ;;
- esac
- ;;
- x86*)
- case ${tgt_os} in
- win*)
- enabled gcc && add_cflags -fno-common
- ;;
- solaris*)
- CC=${CC:-${CROSS}gcc}
- CXX=${CXX:-${CROSS}g++}
- LD=${LD:-${CROSS}gcc}
- CROSS=${CROSS:-g}
- ;;
- os2)
- AS=${AS:-nasm}
- ;;
- esac
-
- AS="${alt_as:-${AS:-auto}}"
- case ${tgt_cc} in
- icc*)
- CC=${CC:-icc}
- LD=${LD:-icc}
- setup_gnu_toolchain
- add_cflags -use-msasm # remove -use-msasm too?
- # add -no-intel-extensions to suppress warning #10237
- # refer to http://software.intel.com/en-us/forums/topic/280199
- add_ldflags -i-static -no-intel-extensions
- enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div
- enabled x86_64 && AR=xiar
- case ${tune_cpu} in
- atom*)
- tune_cflags="-x"
- tune_cpu="SSE3_ATOM"
- ;;
- *)
- tune_cflags="-march="
- ;;
- esac
+ link_with_cc=gcc
+ setup_gnu_toolchain
+ tune_cflags="-mtune="
+ if enabled dspr2; then
+ check_add_cflags -mips32r2 -mdspr2
+ fi
+
+ if enabled runtime_cpu_detect; then
+ disable_feature runtime_cpu_detect
+ fi
+
+ if [ -n "${tune_cpu}" ]; then
+ case ${tune_cpu} in
+ p5600)
+ check_add_cflags -mips32r5 -funroll-loops -mload-store-pairs
+ check_add_cflags -msched-weight -mhard-float -mfp64
+ check_add_asflags -mips32r5 -mhard-float -mfp64
+ check_add_ldflags -mfp64
;;
- gcc*)
- link_with_cc=gcc
- tune_cflags="-march="
- setup_gnu_toolchain
- #for 32 bit x86 builds, -O3 did not turn on this flag
- enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
- ;;
- vs*)
- # When building with Microsoft Visual Studio the assembler is
- # invoked directly. Checking at configure time is unnecessary.
- # Skip the check by setting AS arbitrarily
- AS=msvs
- msvs_arch_dir=x86-msvs
- vc_version=${tgt_cc##vs}
- case $vc_version in
- 7|8|9|10)
- echo "${tgt_cc} does not support avx/avx2, disabling....."
- RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
- soft_disable avx
- soft_disable avx2
- ;;
- esac
+ i6400)
+ check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight
+ check_add_cflags -mload-store-pairs -mhard-float -mfp64
+ check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
+ check_add_ldflags -mips64r6 -mabi=64 -mfp64
;;
esac
- bits=32
- enabled x86_64 && bits=64
- check_cpp <<EOF && bits=x32
-#ifndef __ILP32__
+ if enabled msa; then
+ add_cflags -mmsa
+ add_asflags -mmsa
+ add_ldflags -mmsa
+ fi
+ fi
+
+ check_add_cflags -march=${tgt_isa}
+ check_add_asflags -march=${tgt_isa}
+ check_add_asflags -KPIC
+ ;;
+ x86*)
+ case ${tgt_os} in
+ win*)
+ enabled gcc && add_cflags -fno-common
+ ;;
+ solaris*)
+ CC=${CC:-${CROSS}gcc}
+ CXX=${CXX:-${CROSS}g++}
+ LD=${LD:-${CROSS}gcc}
+ CROSS=${CROSS:-g}
+ ;;
+ os2)
+ AS=${AS:-nasm}
+ ;;
+ esac
+
+ AS="${alt_as:-${AS:-auto}}"
+ case ${tgt_cc} in
+ icc*)
+ CC=${CC:-icc}
+ LD=${LD:-icc}
+ setup_gnu_toolchain
+ add_cflags -use-msasm # remove -use-msasm too?
+ # add -no-intel-extensions to suppress warning #10237
+ # refer to http://software.intel.com/en-us/forums/topic/280199
+ add_ldflags -i-static -no-intel-extensions
+ enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div
+ enabled x86_64 && AR=xiar
+ case ${tune_cpu} in
+ atom*)
+ tune_cflags="-x"
+ tune_cpu="SSE3_ATOM"
+ ;;
+ *)
+ tune_cflags="-march="
+ ;;
+ esac
+ ;;
+ gcc*)
+ link_with_cc=gcc
+ tune_cflags="-march="
+ setup_gnu_toolchain
+ #for 32 bit x86 builds, -O3 did not turn on this flag
+ enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
+ ;;
+ vs*)
+ # When building with Microsoft Visual Studio the assembler is
+ # invoked directly. Checking at configure time is unnecessary.
+ # Skip the check by setting AS arbitrarily
+ AS=msvs
+ msvs_arch_dir=x86-msvs
+ vc_version=${tgt_cc##vs}
+ case $vc_version in
+ 7|8|9|10)
+ echo "${tgt_cc} does not support avx/avx2, disabling....."
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
+ soft_disable avx
+ soft_disable avx2
+ ;;
+ esac
+ ;;
+ esac
+
+ bits=32
+ enabled x86_64 && bits=64
+ check_cpp <<EOF && bits=x32
+#if !defined(__ILP32__) || !defined(__x86_64__)
#error "not x32"
#endif
EOF
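The added __x86_64__ term matters because some compilers define __ILP32__ for ordinary 32-bit builds as well; only the x32 ABI defines both macros:

    # gcc -mx32: __ILP32__ and __x86_64__ set -> probe passes, bits=x32
    # gcc -m32:  __x86_64__ absent            -> probe fails, bits stays 32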
- case ${tgt_cc} in
- gcc*)
- add_cflags -m${bits}
- add_ldflags -m${bits}
- ;;
- esac
-
- soft_enable runtime_cpu_detect
- # We can't use 'check_cflags' until the compiler is configured and CC is
- # populated.
- check_gcc_machine_option mmx
- check_gcc_machine_option sse
- check_gcc_machine_option sse2
- check_gcc_machine_option sse3
- check_gcc_machine_option ssse3
- check_gcc_machine_option sse4 sse4_1
- check_gcc_machine_option avx
- check_gcc_machine_option avx2
-
- case "${AS}" in
- auto|"")
- which nasm >/dev/null 2>&1 && AS=nasm
- which yasm >/dev/null 2>&1 && AS=yasm
- [ "${AS}" = auto ] || [ -z "${AS}" ] \
- && die "Neither yasm nor nasm have been found"
- ;;
- esac
- log_echo " using $AS"
- [ "${AS##*/}" = nasm ] && add_asflags -Ox
- AS_SFX=.asm
- case ${tgt_os} in
- win32)
- add_asflags -f win32
- enabled debug && add_asflags -g cv8
- EXE_SFX=.exe
- ;;
- win64)
- add_asflags -f x64
- enabled debug && add_asflags -g cv8
- EXE_SFX=.exe
- ;;
- linux*|solaris*|android*)
- add_asflags -f elf${bits}
- enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2
- enabled debug && [ "${AS}" = nasm ] && add_asflags -g
- [ "${AS##*/}" = nasm ] && check_asm_align
- ;;
- darwin*)
- add_asflags -f macho${bits}
- enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64"
- add_cflags ${darwin_arch}
- add_ldflags ${darwin_arch}
- # -mdynamic-no-pic is still a bit of voodoo -- it was required at
- # one time, but does not seem to be now, and it breaks some of the
- # code that still relies on inline assembly.
- # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
- enabled icc && ! enabled pic && add_cflags -fno-pic
- ;;
- iphonesimulator)
- add_asflags -f macho${bits}
- enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
- add_cflags ${sim_arch}
- add_ldflags ${sim_arch}
- ;;
- os2)
- add_asflags -f aout
- enabled debug && add_asflags -g
- EXE_SFX=.exe
- ;;
- *) log "Warning: Unknown os $tgt_os while setting up $AS flags"
- ;;
- esac
- ;;
+ case ${tgt_cc} in
+ gcc*)
+ add_cflags -m${bits}
+ add_ldflags -m${bits}
+ ;;
+ esac
+
+ soft_enable runtime_cpu_detect
+ # We can't use 'check_cflags' until the compiler is configured and CC is
+ # populated.
+ check_gcc_machine_option mmx
+ check_gcc_machine_option sse
+ check_gcc_machine_option sse2
+ check_gcc_machine_option sse3
+ check_gcc_machine_option ssse3
+ check_gcc_machine_option sse4 sse4_1
+ check_gcc_machine_option avx
+ check_gcc_machine_option avx2
+
+ case "${AS}" in
+ auto|"")
+ which nasm >/dev/null 2>&1 && AS=nasm
+ which yasm >/dev/null 2>&1 && AS=yasm
+ if [ "${AS}" = nasm ] ; then
+ # Apple ships version 0.98 of nasm through at least Xcode 6. Revisit
+ # this check if they start shipping a compatible version.
+ apple=`nasm -v | grep "Apple"`
+ [ -n "${apple}" ] \
+ && echo "Unsupported version of nasm: ${apple}" \
+ && AS=""
+ fi
+ [ "${AS}" = auto ] || [ -z "${AS}" ] \
+ && die "Neither yasm nor nasm have been found"
+ ;;
+ esac
+ log_echo " using $AS"
+ [ "${AS##*/}" = nasm ] && add_asflags -Ox
+ AS_SFX=.asm
+ case ${tgt_os} in
+ win32)
+ add_asflags -f win32
+ enabled debug && add_asflags -g cv8
+ EXE_SFX=.exe
+ ;;
+ win64)
+ add_asflags -f x64
+ enabled debug && add_asflags -g cv8
+ EXE_SFX=.exe
+ ;;
+ linux*|solaris*|android*)
+ add_asflags -f elf${bits}
+ enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2
+ enabled debug && [ "${AS}" = nasm ] && add_asflags -g
+ [ "${AS##*/}" = nasm ] && check_asm_align
+ ;;
+ darwin*)
+ add_asflags -f macho${bits}
+ enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64"
+ add_cflags ${darwin_arch}
+ add_ldflags ${darwin_arch}
+ # -mdynamic-no-pic is still a bit of voodoo -- it was required at
+ # one time, but does not seem to be now, and it breaks some of the
+ # code that still relies on inline assembly.
+ # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
+ enabled icc && ! enabled pic && add_cflags -fno-pic
+ ;;
+ iphonesimulator)
+ add_asflags -f macho${bits}
+ enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
+ add_cflags ${sim_arch}
+ add_ldflags ${sim_arch}
+ ;;
+ os2)
+ add_asflags -f aout
+ enabled debug && add_asflags -g
+ EXE_SFX=.exe
+ ;;
+ *)
+ log "Warning: Unknown os $tgt_os while setting up $AS flags"
+ ;;
+ esac
+ ;;
universal*|*-gcc|generic-gnu)
- link_with_cc=gcc
- enable_feature gcc
- setup_gnu_toolchain
- ;;
- esac
+ link_with_cc=gcc
+ enable_feature gcc
+ setup_gnu_toolchain
+ ;;
+ esac
- # Try to enable CPU specific tuning
- if [ -n "${tune_cpu}" ]; then
- if [ -n "${tune_cflags}" ]; then
- check_add_cflags ${tune_cflags}${tune_cpu} || \
- die "Requested CPU '${tune_cpu}' not supported by compiler"
- fi
+ # Try to enable CPU specific tuning
+ if [ -n "${tune_cpu}" ]; then
+ if [ -n "${tune_cflags}" ]; then
+ check_add_cflags ${tune_cflags}${tune_cpu} || \
+ die "Requested CPU '${tune_cpu}' not supported by compiler"
+ fi
if [ -n "${tune_asflags}" ]; then
- check_add_asflags ${tune_asflags}${tune_cpu} || \
- die "Requested CPU '${tune_cpu}' not supported by assembler"
- fi
+ check_add_asflags ${tune_asflags}${tune_cpu} || \
+ die "Requested CPU '${tune_cpu}' not supported by assembler"
+ fi
if [ -z "${tune_cflags}${tune_asflags}" ]; then
- log_echo "Warning: CPU tuning not supported by this toolchain"
- fi
+ log_echo "Warning: CPU tuning not supported by this toolchain"
fi
-
- if enabled debug; then
- check_add_cflags -g && check_add_ldflags -g
+ fi
+
+ if enabled debug; then
+ check_add_cflags -g && check_add_ldflags -g
+ else
+ check_add_cflags -DNDEBUG
+ fi
+
+ enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
+ enabled gcov &&
+ check_add_cflags -fprofile-arcs -ftest-coverage &&
+ check_add_ldflags -fprofile-arcs -ftest-coverage
+
+ if enabled optimizations; then
+ if enabled rvct; then
+ enabled small && check_add_cflags -Ospace || check_add_cflags -Otime
else
- check_add_cflags -DNDEBUG
- fi
-
- enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
- enabled gcov &&
- check_add_cflags -fprofile-arcs -ftest-coverage &&
- check_add_ldflags -fprofile-arcs -ftest-coverage
-
- if enabled optimizations; then
- if enabled rvct; then
- enabled small && check_add_cflags -Ospace || check_add_cflags -Otime
- else
- enabled small && check_add_cflags -O2 || check_add_cflags -O3
- fi
+ enabled small && check_add_cflags -O2 || check_add_cflags -O3
fi
+ fi
- tgt_os_no_version=$(echo "${tgt_os}" | tr -d "[0-9]")
- # Default use_x86inc to yes when we are 64 bit, non-pic, or on any
- # non-Darwin target.
- if [ "${tgt_isa}" = "x86_64" ] || [ "${pic}" != "yes" ] || \
- [ "${tgt_os_no_version}" != "darwin" ]; then
- soft_enable use_x86inc
- fi
+ if [ "${tgt_isa}" = "x86_64" ] || [ "${tgt_isa}" = "x86" ]; then
+ soft_enable use_x86inc
+ fi
- # Position Independent Code (PIC) support, for building relocatable
- # shared objects
- enabled gcc && enabled pic && check_add_cflags -fPIC
+ # Position Independent Code (PIC) support, for building relocatable
+ # shared objects
+ enabled gcc && enabled pic && check_add_cflags -fPIC
- # Work around longjmp interception on glibc >= 2.11, to improve binary
- # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
- enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0
+ # Work around longjmp interception on glibc >= 2.11, to improve binary
+ # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
+ enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0
- # Check for strip utility variant
- ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
+ # Check for strip utility variant
+ ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
- # Try to determine target endianness
- check_cc <<EOF
- unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
+ # Try to determine target endianness
+ check_cc <<EOF
+unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
EOF
[ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian
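A worked reading of that probe: 'O'<<24 | '2'<<16 | 'B'<<8 | 'E' is 0x4f324245, so a big-endian target lays the initializer out as the byte run "4f 32 42 45", which the grep matches; a little-endian target stores "45 42 32 4f" and big_endian stays disabled.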
# Try to find which inline keywords are supported
check_cc <<EOF && INLINE="inline"
- static inline function() {}
+static inline function() {}
EOF
- check_cc <<EOF && INLINE="__inline__ __attribute__((always_inline))"
- static __attribute__((always_inline)) function() {}
-EOF
-
- # Almost every platform uses pthreads.
- if enabled multithread; then
- case ${toolchain} in
- *-win*-vs*);;
- *-android-gcc);;
- *) check_header pthread.h && add_extralibs -lpthread
- esac
- fi
- # only for MIPS platforms
+ # Almost every platform uses pthreads.
+ if enabled multithread; then
case ${toolchain} in
- mips*)
- if enabled dspr2; then
- if enabled big_endian; then
- echo "dspr2 optimizations are available only for little endian platforms"
- disable_feature dspr2
- fi
- fi
+ *-win*-vs*)
+ ;;
+ *-android-gcc)
+ ;;
+ *)
+ check_header pthread.h && add_extralibs -lpthread
;;
esac
+ fi
- # glibc needs these
- if enabled linux; then
- add_cflags -D_LARGEFILE_SOURCE
- add_cflags -D_FILE_OFFSET_BITS=64
- fi
-
- # append any user defined extra cflags
- if [ -n "${extra_cflags}" ] ; then
- check_add_cflags ${extra_cflags} || \
- die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
- fi
+ # only for MIPS platforms
+ case ${toolchain} in
+ mips*)
+ if enabled big_endian; then
+ if enabled dspr2; then
+ echo "dspr2 optimizations are available only for little endian platforms"
+ disable_feature dspr2
+ fi
+ if enabled msa; then
+ echo "msa optimizations are available only for little endian platforms"
+ disable_feature msa
+ fi
+ fi
+ ;;
+ esac
+
+ # glibc needs these
+ if enabled linux; then
+ add_cflags -D_LARGEFILE_SOURCE
+ add_cflags -D_FILE_OFFSET_BITS=64
+ fi
+
+ # append any user defined extra cflags
+ if [ -n "${extra_cflags}" ] ; then
+ check_add_cflags ${extra_cflags} || \
+ die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
+ fi
}
process_toolchain() {
- process_common_toolchain
+ process_common_toolchain
}
print_config_mk() {
- saved_prefix="${prefix}"
- prefix=$1
- makefile=$2
- shift 2
- for cfg; do
- if enabled $cfg; then
- upname="`toupper $cfg`"
- echo "${prefix}_${upname}=yes" >> $makefile
- fi
- done
- prefix="${saved_prefix}"
+ saved_prefix="${prefix}"
+ prefix=$1
+ makefile=$2
+ shift 2
+ for cfg; do
+ if enabled $cfg; then
+ upname="`toupper $cfg`"
+ echo "${prefix}_${upname}=yes" >> $makefile
+ fi
+ done
+ prefix="${saved_prefix}"
}
print_config_h() {
- saved_prefix="${prefix}"
- prefix=$1
- header=$2
- shift 2
- for cfg; do
- upname="`toupper $cfg`"
- if enabled $cfg; then
- echo "#define ${prefix}_${upname} 1" >> $header
- else
- echo "#define ${prefix}_${upname} 0" >> $header
- fi
- done
- prefix="${saved_prefix}"
+ saved_prefix="${prefix}"
+ prefix=$1
+ header=$2
+ shift 2
+ for cfg; do
+ upname="`toupper $cfg`"
+ if enabled $cfg; then
+ echo "#define ${prefix}_${upname} 1" >> $header
+ else
+ echo "#define ${prefix}_${upname} 0" >> $header
+ fi
+ done
+ prefix="${saved_prefix}"
}
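To make the generated output concrete: a hypothetical invocation such as "print_config_h HAVE config.h mmx sse2" (flag names invented for illustration), with mmx enabled and sse2 disabled, appends:

    /* Fragment appended to config.h by the hypothetical invocation above: */
    #define HAVE_MMX 1
    #define HAVE_SSE2 0

print_config_mk, by contrast, emits only the enabled flags, as makefile assignments of the form HAVE_MMX=yes.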
print_config_vars_h() {
- header=$1
- shift
- while [ $# -gt 0 ]; do
- upname="`toupper $1`"
- echo "#define ${upname} $2" >> $header
- shift 2
- done
+ header=$1
+ shift
+ while [ $# -gt 0 ]; do
+ upname="`toupper $1`"
+ echo "#define ${upname} $2" >> $header
+ shift 2
+ done
}
print_webm_license() {
- saved_prefix="${prefix}"
- destination=$1
- prefix="$2"
- suffix="$3"
- shift 3
- cat <<EOF > ${destination}
+ saved_prefix="${prefix}"
+ destination=$1
+ prefix="$2"
+ suffix="$3"
+ shift 3
+ cat <<EOF > ${destination}
${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
${prefix} ${suffix}
${prefix} Use of this source code is governed by a BSD-style license${suffix}
@@ -1390,43 +1384,43 @@ ${prefix} tree. An additional intellectual property rights grant can be found${s
${prefix} in the file PATENTS. All contributing project authors may${suffix}
${prefix} be found in the AUTHORS file in the root of the source tree.${suffix}
EOF
- prefix="${saved_prefix}"
+ prefix="${saved_prefix}"
}
process_targets() {
- true;
+ true;
}
process_detect() {
- true;
+ true;
}
enable_feature logging
logfile="config.log"
self=$0
process() {
- cmdline_args="$@"
- process_cmdline "$@"
- if enabled child; then
- echo "# ${self} $@" >> ${logfile}
- else
- echo "# ${self} $@" > ${logfile}
- fi
- post_process_common_cmdline
- post_process_cmdline
- process_toolchain
- process_detect
- process_targets
-
- OOT_INSTALLS="${OOT_INSTALLS}"
- if enabled source_path_used; then
- # Prepare the PWD for building.
- for f in ${OOT_INSTALLS}; do
- install -D "${source_path}/$f" "$f"
- done
- fi
- cp "${source_path}/build/make/Makefile" .
-
- clean_temp_files
- true
+ cmdline_args="$@"
+ process_cmdline "$@"
+ if enabled child; then
+ echo "# ${self} $@" >> ${logfile}
+ else
+ echo "# ${self} $@" > ${logfile}
+ fi
+ post_process_common_cmdline
+ post_process_cmdline
+ process_toolchain
+ process_detect
+ process_targets
+
+ OOT_INSTALLS="${OOT_INSTALLS}"
+ if enabled source_path_used; then
+ # Prepare the PWD for building.
+ for f in ${OOT_INSTALLS}; do
+ install -D "${source_path}/$f" "$f"
+ done
+ fi
+ cp "${source_path}/build/make/Makefile" .
+
+ clean_temp_files
+ true
}
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_proj.sh b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_proj.sh
index 79072259349..dcce78255d4 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_proj.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_proj.sh
@@ -295,23 +295,8 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
- obj_int_extract)
- tag Tool \
- Name="VCCLCompilerTool" \
- Optimization="0" \
- AdditionalIncludeDirectories="$incs" \
- PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
- RuntimeLibrary="$debug_runtime" \
- WarningLevel="3" \
- DebugInformationFormat="1" \
- $warn_64bit \
- ;;
vpx)
tag Tool \
- Name="VCPreBuildEventTool" \
- CommandLine="call obj_int_extract.bat &quot;$src_path_bare&quot; $plat_no_ws\\\$(ConfigurationName)" \
-
- tag Tool \
Name="VCCLCompilerTool" \
Optimization="0" \
AdditionalIncludeDirectories="$incs" \
@@ -347,11 +332,6 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
- obj_int_extract)
- tag Tool \
- Name="VCLinkerTool" \
- GenerateDebugInformation="true" \
- ;;
*)
tag Tool \
Name="VCLinkerTool" \
@@ -400,25 +380,8 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
- obj_int_extract)
- tag Tool \
- Name="VCCLCompilerTool" \
- Optimization="2" \
- FavorSizeorSpeed="1" \
- AdditionalIncludeDirectories="$incs" \
- PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
- RuntimeLibrary="$release_runtime" \
- UsePrecompiledHeader="0" \
- WarningLevel="3" \
- DebugInformationFormat="0" \
- $warn_64bit \
- ;;
vpx)
tag Tool \
- Name="VCPreBuildEventTool" \
- CommandLine="call obj_int_extract.bat &quot;$src_path_bare&quot; $plat_no_ws\\\$(ConfigurationName)" \
-
- tag Tool \
Name="VCCLCompilerTool" \
Optimization="2" \
FavorSizeorSpeed="1" \
@@ -456,11 +419,6 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
- obj_int_extract)
- tag Tool \
- Name="VCLinkerTool" \
- GenerateDebugInformation="true" \
- ;;
*)
tag Tool \
Name="VCLinkerTool" \
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
index 56b9a3b50b4..643ebd634be 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
@@ -262,15 +262,9 @@ case "$target" in
asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
;;
arm*)
+ platforms[0]="ARM"
asm_Debug_cmdline="armasm -nologo &quot;%(FullPath)&quot;"
asm_Release_cmdline="armasm -nologo &quot;%(FullPath)&quot;"
- if [ "$name" = "obj_int_extract" ]; then
- # We don't want to build this tool for the target architecture,
- # but for an architecture we can run locally during the build.
- platforms[0]="Win32"
- else
- platforms[0]="ARM"
- fi
;;
*) die "Unsupported target $target!"
;;
@@ -400,23 +394,13 @@ generate_vcxproj() {
if [ "$hostplat" == "ARM" ]; then
hostplat=Win32
fi
- open_tag PreBuildEvent
- tag_content Command "call obj_int_extract.bat &quot;$src_path_bare&quot; $hostplat\\\$(Configuration)"
- close_tag PreBuildEvent
fi
open_tag ClCompile
if [ "$config" = "Debug" ]; then
opt=Disabled
runtime=$debug_runtime
curlibs=$debug_libs
- case "$name" in
- obj_int_extract)
- debug=DEBUG
- ;;
- *)
- debug=_DEBUG
- ;;
- esac
+ debug=_DEBUG
else
opt=MaxSpeed
runtime=$release_runtime
@@ -424,14 +408,7 @@ generate_vcxproj() {
tag_content FavorSizeOrSpeed Speed
debug=NDEBUG
fi
- case "$name" in
- obj_int_extract)
- extradefines=";_CONSOLE"
- ;;
- *)
- extradefines=";$defines"
- ;;
- esac
+ extradefines=";$defines"
tag_content Optimization $opt
tag_content AdditionalIncludeDirectories "$incs;%(AdditionalIncludeDirectories)"
tag_content PreprocessorDefinitions "WIN32;$debug;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE$extradefines;%(PreprocessorDefinitions)"
@@ -451,10 +428,6 @@ generate_vcxproj() {
case "$proj_kind" in
exe)
open_tag Link
- if [ "$name" != "obj_int_extract" ]; then
- tag_content AdditionalDependencies "$curlibs;%(AdditionalDependencies)"
- tag_content AdditionalLibraryDirectories "$libdirs;%(AdditionalLibraryDirectories)"
- fi
tag_content GenerateDebugInformation true
# Console is the default normally, but if
# AppContainerApplication is set, we need to override it.
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/iosbuild.sh b/chromium/third_party/libvpx/source/libvpx/build/make/iosbuild.sh
index fb91b87894b..89fa681864a 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/iosbuild.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/iosbuild.sh
@@ -18,15 +18,19 @@ set -e
devnull='> /dev/null 2>&1'
BUILD_ROOT="_iosbuild"
+CONFIGURE_ARGS="--disable-docs
+ --disable-examples
+ --disable-libyuv
+ --disable-unit-tests"
DIST_DIR="_dist"
FRAMEWORK_DIR="VPX.framework"
HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
MAKE_JOBS=1
-LIBVPX_SOURCE_DIR=$(dirname "$0" | sed -e s,/build/make,,)
+SCRIPT_DIR=$(dirname "$0")
+LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
ORIG_PWD="$(pwd)"
TARGETS="arm64-darwin-gcc
- armv6-darwin-gcc
armv7-darwin-gcc
armv7s-darwin-gcc
x86-iphonesimulator-gcc
@@ -42,8 +46,8 @@ build_target() {
mkdir "${target}"
cd "${target}"
- eval "../../${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
- --disable-docs ${EXTRA_CONFIGURE_ARGS} ${devnull}
+ eval "${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
+ ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${devnull}
export DIST_DIR
eval make -j ${MAKE_JOBS} dist ${devnull}
cd "${old_pwd}"
@@ -58,9 +62,6 @@ target_to_preproc_symbol() {
arm64-*)
echo "__aarch64__"
;;
- armv6-*)
- echo "__ARM_ARCH_6__"
- ;;
armv7-*)
echo "__ARM_ARCH_7A__"
;;
@@ -176,8 +177,13 @@ build_framework() {
# Trap function. Cleans up the subtree used to build all targets contained in
# $TARGETS.
cleanup() {
+  local res=$?
cd "${ORIG_PWD}"
+ if [ $res -ne 0 ]; then
+ elog "build exited with error ($res)"
+ fi
+
if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then
rm -rf "${BUILD_ROOT}"
fi
@@ -187,14 +193,21 @@ iosbuild_usage() {
cat << EOF
Usage: ${0##*/} [arguments]
--help: Display this message and exit.
+ --extra-configure-args <args>: Extra args to pass when configuring libvpx.
--jobs: Number of make jobs.
--preserve-build-output: Do not delete the build directory.
--show-build-output: Show output from each library build.
+ --targets <targets>: Override default target list. Defaults:
+ ${TARGETS}
--verbose: Output information about the environment and each stage of the
build.
EOF
}
+elog() {
+ echo "${0##*/} failed because: $@" 1>&2
+}
+
vlog() {
if [ "${VERBOSE}" = "yes" ]; then
echo "$@"
@@ -224,6 +237,10 @@ while [ -n "$1" ]; do
--show-build-output)
devnull=
;;
+ --targets)
+ TARGETS="$2"
+ shift
+ ;;
--verbose)
VERBOSE=yes
;;
@@ -239,6 +256,7 @@ if [ "${VERBOSE}" = "yes" ]; then
cat << EOF
BUILD_ROOT=${BUILD_ROOT}
DIST_DIR=${DIST_DIR}
+ CONFIGURE_ARGS=${CONFIGURE_ARGS}
EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
FRAMEWORK_DIR=${FRAMEWORK_DIR}
HEADER_DIR=${HEADER_DIR}
@@ -252,3 +270,5 @@ EOF
fi
build_framework "${TARGETS}"
+echo "Successfully built '${FRAMEWORK_DIR}' for:"
+echo " ${TARGETS}"
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/obj_int_extract.c b/chromium/third_party/libvpx/source/libvpx/build/make/obj_int_extract.c
deleted file mode 100644
index 2e50f387fa5..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/build/make/obj_int_extract.c
+++ /dev/null
@@ -1,857 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-typedef enum {
- OUTPUT_FMT_PLAIN,
- OUTPUT_FMT_RVDS,
- OUTPUT_FMT_GAS,
- OUTPUT_FMT_C_HEADER,
-} output_fmt_t;
-
-int log_msg(const char *fmt, ...) {
- int res;
- va_list ap;
- va_start(ap, fmt);
- res = vfprintf(stderr, fmt, ap);
- va_end(ap);
- return res;
-}
-
-#if defined(__GNUC__) && __GNUC__
-
-#if defined(FORCE_PARSE_ELF)
-
-#if defined(__MACH__)
-#undef __MACH__
-#endif
-
-#if !defined(__ELF__)
-#define __ELF__
-#endif
-#endif
-
-#if defined(__MACH__)
-
-#include <mach-o/loader.h>
-#include <mach-o/nlist.h>
-
-int print_macho_equ(output_fmt_t mode, uint8_t* name, int val) {
- switch (mode) {
- case OUTPUT_FMT_RVDS:
- printf("%-40s EQU %5d\n", name, val);
- return 0;
- case OUTPUT_FMT_GAS:
- printf(".set %-40s, %5d\n", name, val);
- return 0;
- case OUTPUT_FMT_C_HEADER:
- printf("#define %-40s %5d\n", name, val);
- return 0;
- default:
- log_msg("Unsupported mode: %d", mode);
- return 1;
- }
-}
-
-int parse_macho(uint8_t *base_buf, size_t sz, output_fmt_t mode) {
- int i, j;
- struct mach_header header;
- uint8_t *buf = base_buf;
- int base_data_section = 0;
- int bits = 0;
-
- /* We can read in mach_header for 32 and 64 bit architectures
- * because it's identical to mach_header_64 except for the last
- * element (uint32_t reserved), which we don't use. Then, when
- * we know which architecture we're looking at, increment buf
- * appropriately.
- */
- memcpy(&header, buf, sizeof(struct mach_header));
-
- if (header.magic == MH_MAGIC) {
- if (header.cputype == CPU_TYPE_ARM
- || header.cputype == CPU_TYPE_X86) {
- bits = 32;
- buf += sizeof(struct mach_header);
- } else {
- log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n");
- goto bail;
- }
- } else if (header.magic == MH_MAGIC_64) {
- if (header.cputype == CPU_TYPE_X86_64) {
- bits = 64;
- buf += sizeof(struct mach_header_64);
- } else {
- log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n");
- goto bail;
- }
- } else {
- log_msg("Bad magic number for object file. 0x%x or 0x%x expected, 0x%x found.\n",
- MH_MAGIC, MH_MAGIC_64, header.magic);
- goto bail;
- }
-
- if (header.filetype != MH_OBJECT) {
- log_msg("Bad filetype for object file. Currently only tested for MH_OBJECT.\n");
- goto bail;
- }
-
- for (i = 0; i < header.ncmds; i++) {
- struct load_command lc;
-
- memcpy(&lc, buf, sizeof(struct load_command));
-
- if (lc.cmd == LC_SEGMENT) {
- uint8_t *seg_buf = buf;
- struct section s;
- struct segment_command seg_c;
-
- memcpy(&seg_c, seg_buf, sizeof(struct segment_command));
- seg_buf += sizeof(struct segment_command);
-
-      /* Although each section is given its own offset, nlist.n_value
- * references the offset of the first section. This isn't
- * apparent without debug information because the offset of the
- * data section is the same as the first section. However, with
- * debug sections mixed in, the offset of the debug section
- * increases but n_value still references the first section.
- */
- if (seg_c.nsects < 1) {
- log_msg("Not enough sections\n");
- goto bail;
- }
-
- memcpy(&s, seg_buf, sizeof(struct section));
- base_data_section = s.offset;
- } else if (lc.cmd == LC_SEGMENT_64) {
- uint8_t *seg_buf = buf;
- struct section_64 s;
- struct segment_command_64 seg_c;
-
- memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64));
- seg_buf += sizeof(struct segment_command_64);
-
-      /* Explanation in LC_SEGMENT */
- if (seg_c.nsects < 1) {
- log_msg("Not enough sections\n");
- goto bail;
- }
-
- memcpy(&s, seg_buf, sizeof(struct section_64));
- base_data_section = s.offset;
- } else if (lc.cmd == LC_SYMTAB) {
- if (base_data_section != 0) {
- struct symtab_command sc;
- uint8_t *sym_buf = base_buf;
- uint8_t *str_buf = base_buf;
-
- memcpy(&sc, buf, sizeof(struct symtab_command));
-
- if (sc.cmdsize != sizeof(struct symtab_command)) {
- log_msg("Can't find symbol table!\n");
- goto bail;
- }
-
- sym_buf += sc.symoff;
- str_buf += sc.stroff;
-
- for (j = 0; j < sc.nsyms; j++) {
-          /* Location of string is calculated each time from the
- * start of the string buffer. On darwin the symbols
- * are prefixed by "_", so we bump the pointer by 1.
- * The target value is defined as an int in *_asm_*_offsets.c,
- * which is 4 bytes on all targets we currently use.
- */
- if (bits == 32) {
- struct nlist nl;
- int val;
-
- memcpy(&nl, sym_buf, sizeof(struct nlist));
- sym_buf += sizeof(struct nlist);
-
- memcpy(&val, base_buf + base_data_section + nl.n_value,
- sizeof(val));
- print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val);
- } else { /* if (bits == 64) */
- struct nlist_64 nl;
- int val;
-
- memcpy(&nl, sym_buf, sizeof(struct nlist_64));
- sym_buf += sizeof(struct nlist_64);
-
- memcpy(&val, base_buf + base_data_section + nl.n_value,
- sizeof(val));
- print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val);
- }
- }
- }
- }
-
- buf += lc.cmdsize;
- }
-
- return 0;
-bail:
- return 1;
-
-}
-
-#elif defined(__ELF__)
-#include "elf.h"
-
-#define COPY_STRUCT(dst, buf, ofst, sz) do {\
- if(ofst + sizeof((*(dst))) > sz) goto bail;\
- memcpy(dst, buf+ofst, sizeof((*(dst))));\
- } while(0)
-
-#define ENDIAN_ASSIGN(val, memb) do {\
- if(!elf->le_data) {log_msg("Big Endian data not supported yet!\n");goto bail;}\
- (val) = (memb);\
- } while(0)
-
-#define ENDIAN_ASSIGN_IN_PLACE(memb) do {\
- ENDIAN_ASSIGN(memb, memb);\
- } while(0)
-
-typedef struct {
- uint8_t *buf; /* Buffer containing ELF data */
- size_t sz; /* Buffer size */
- int le_data; /* Data is little-endian */
- unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
- int bits; /* 32 or 64 */
- Elf32_Ehdr hdr32;
- Elf64_Ehdr hdr64;
-} elf_obj_t;
-
-int parse_elf_header(elf_obj_t *elf) {
- int res;
- /* Verify ELF Magic numbers */
- COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz);
- res = elf->e_ident[EI_MAG0] == ELFMAG0;
- res &= elf->e_ident[EI_MAG1] == ELFMAG1;
- res &= elf->e_ident[EI_MAG2] == ELFMAG2;
- res &= elf->e_ident[EI_MAG3] == ELFMAG3;
- res &= elf->e_ident[EI_CLASS] == ELFCLASS32
- || elf->e_ident[EI_CLASS] == ELFCLASS64;
- res &= elf->e_ident[EI_DATA] == ELFDATA2LSB;
-
- if (!res) goto bail;
-
- elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB;
-
- /* Read in relevant values */
- if (elf->e_ident[EI_CLASS] == ELFCLASS32) {
- elf->bits = 32;
- COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz);
-
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx);
- } else { /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */
- elf->bits = 64;
- COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz);
-
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx);
- }
-
- return 0;
-bail:
- log_msg("Failed to parse ELF file header");
- return 1;
-}
-
-int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64) {
- if (hdr32) {
- if (idx >= elf->hdr32.e_shnum)
- goto bail;
-
- COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize,
- elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize);
- } else { /* if (hdr64) */
- if (idx >= elf->hdr64.e_shnum)
- goto bail;
-
- COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize,
- elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize);
- }
-
- return 0;
-bail:
- return 1;
-}
-
-const char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
- if (elf->bits == 32) {
- Elf32_Shdr shdr;
-
- if (parse_elf_section(elf, s_idx, &shdr, NULL)) {
- log_msg("Failed to parse ELF string table: section %d, index %d\n",
- s_idx, idx);
- return "";
- }
-
- return (char *)(elf->buf + shdr.sh_offset + idx);
- } else { /* if (elf->bits == 64) */
- Elf64_Shdr shdr;
-
- if (parse_elf_section(elf, s_idx, NULL, &shdr)) {
- log_msg("Failed to parse ELF string table: section %d, index %d\n",
- s_idx, idx);
- return "";
- }
-
- return (char *)(elf->buf + shdr.sh_offset + idx);
- }
-}
-
-int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64) {
- if (sym32) {
- COPY_STRUCT(sym32, elf->buf, ofst, elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_name);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_value);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_size);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_info);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_other);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx);
- } else { /* if (sym64) */
- COPY_STRUCT(sym64, elf->buf, ofst, elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_name);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_value);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_size);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_info);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_other);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx);
- }
- return 0;
-bail:
- return 1;
-}
-
-int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) {
- elf_obj_t elf;
- unsigned int ofst;
- int i;
- Elf32_Off strtab_off32;
- Elf64_Off strtab_off64; /* save String Table offset for later use */
-
- memset(&elf, 0, sizeof(elf));
- elf.buf = buf;
- elf.sz = sz;
-
- /* Parse Header */
- if (parse_elf_header(&elf))
- goto bail;
-
- if (elf.bits == 32) {
- Elf32_Shdr shdr;
- for (i = 0; i < elf.hdr32.e_shnum; i++) {
- parse_elf_section(&elf, i, &shdr, NULL);
-
- if (shdr.sh_type == SHT_STRTAB) {
- char strtsb_name[128];
-
- strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
-
- if (!(strcmp(strtsb_name, ".shstrtab"))) {
- /* log_msg("found section: %s\n", strtsb_name); */
- strtab_off32 = shdr.sh_offset;
- break;
- }
- }
- }
- } else { /* if (elf.bits == 64) */
- Elf64_Shdr shdr;
- for (i = 0; i < elf.hdr64.e_shnum; i++) {
- parse_elf_section(&elf, i, NULL, &shdr);
-
- if (shdr.sh_type == SHT_STRTAB) {
- char strtsb_name[128];
-
- strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
-
- if (!(strcmp(strtsb_name, ".shstrtab"))) {
- /* log_msg("found section: %s\n", strtsb_name); */
- strtab_off64 = shdr.sh_offset;
- break;
- }
- }
- }
- }
-
- /* Parse all Symbol Tables */
- if (elf.bits == 32) {
- Elf32_Shdr shdr;
- for (i = 0; i < elf.hdr32.e_shnum; i++) {
- parse_elf_section(&elf, i, &shdr, NULL);
-
- if (shdr.sh_type == SHT_SYMTAB) {
- for (ofst = shdr.sh_offset;
- ofst < shdr.sh_offset + shdr.sh_size;
- ofst += shdr.sh_entsize) {
- Elf32_Sym sym;
-
- parse_elf_symbol(&elf, ofst, &sym, NULL);
-
- /* For all OBJECTS (data objects), extract the value from the
- * proper data segment.
- */
- /* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
- log_msg("found data object %s\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name));
- */
-
- if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
- && sym.st_size == 4) {
- Elf32_Shdr dhdr;
- int val = 0;
- char section_name[128];
-
- parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL);
-
-            /* For explanation - refer to _MSC_VER version of code */
- strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name));
- /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
-
- if (strcmp(section_name, ".bss")) {
- if (sizeof(val) != sym.st_size) {
- /* The target value is declared as an int in
- * *_asm_*_offsets.c, which is 4 bytes on all
- * targets we currently use. Complain loudly if
- * this is not true.
- */
- log_msg("Symbol size is wrong\n");
- goto bail;
- }
-
- memcpy(&val,
- elf.buf + dhdr.sh_offset + sym.st_value,
- sym.st_size);
- }
-
- if (!elf.le_data) {
- log_msg("Big Endian data not supported yet!\n");
- goto bail;
- }
-
- switch (mode) {
- case OUTPUT_FMT_RVDS:
- printf("%-40s EQU %5d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- case OUTPUT_FMT_GAS:
- printf(".equ %-40s, %5d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- case OUTPUT_FMT_C_HEADER:
- printf("#define %-40s %5d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- default:
- printf("%s = %d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- }
- }
- }
- }
- }
- } else { /* if (elf.bits == 64) */
- Elf64_Shdr shdr;
- for (i = 0; i < elf.hdr64.e_shnum; i++) {
- parse_elf_section(&elf, i, NULL, &shdr);
-
- if (shdr.sh_type == SHT_SYMTAB) {
- for (ofst = shdr.sh_offset;
- ofst < shdr.sh_offset + shdr.sh_size;
- ofst += shdr.sh_entsize) {
- Elf64_Sym sym;
-
- parse_elf_symbol(&elf, ofst, NULL, &sym);
-
- /* For all OBJECTS (data objects), extract the value from the
- * proper data segment.
- */
- /* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
- log_msg("found data object %s\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name));
- */
-
- if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT
- && sym.st_size == 4) {
- Elf64_Shdr dhdr;
- int val = 0;
- char section_name[128];
-
- parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr);
-
-            /* For explanation - refer to _MSC_VER version of code */
- strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name));
- /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
-
- if ((strcmp(section_name, ".bss"))) {
- if (sizeof(val) != sym.st_size) {
- /* The target value is declared as an int in
- * *_asm_*_offsets.c, which is 4 bytes on all
- * targets we currently use. Complain loudly if
- * this is not true.
- */
- log_msg("Symbol size is wrong\n");
- goto bail;
- }
-
- memcpy(&val,
- elf.buf + dhdr.sh_offset + sym.st_value,
- sym.st_size);
- }
-
- if (!elf.le_data) {
- log_msg("Big Endian data not supported yet!\n");
- goto bail;
- }
-
- switch (mode) {
- case OUTPUT_FMT_RVDS:
- printf("%-40s EQU %5d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- case OUTPUT_FMT_GAS:
- printf(".equ %-40s, %5d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- default:
- printf("%s = %d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- }
- }
- }
- }
- }
- }
-
- if (mode == OUTPUT_FMT_RVDS)
- printf(" END\n");
-
- return 0;
-bail:
- log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n");
- return 1;
-}
-
-#endif
-#endif /* defined(__GNUC__) && __GNUC__ */
-
-
-#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
-/* See "Microsoft Portable Executable and Common Object File Format Specification"
- for reference.
-*/
-#define get_le32(x) ((*(x)) | (*(x+1)) << 8 |(*(x+2)) << 16 | (*(x+3)) << 24 )
-#define get_le16(x) ((*(x)) | (*(x+1)) << 8)
-
-int parse_coff(uint8_t *buf, size_t sz) {
- unsigned int nsections, symtab_ptr, symtab_sz, strtab_ptr;
- unsigned int sectionrawdata_ptr;
- unsigned int i;
- uint8_t *ptr;
- uint32_t symoffset;
-
- char **sectionlist; // this array holds all section names in their correct order.
- // it is used to check if the symbol is in .bss or .rdata section.
-
- nsections = get_le16(buf + 2);
- symtab_ptr = get_le32(buf + 8);
- symtab_sz = get_le32(buf + 12);
- strtab_ptr = symtab_ptr + symtab_sz * 18;
-
- if (nsections > 96) {
- log_msg("Too many sections\n");
- return 1;
- }
-
- sectionlist = malloc(nsections * sizeof(sectionlist));
-
- if (sectionlist == NULL) {
- log_msg("Allocating first level of section list failed\n");
- return 1;
- }
-
- // log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections);
-
- /*
- The size of optional header is always zero for an obj file. So, the section header
- follows the file header immediately.
- */
-
- ptr = buf + 20; // section header
-
- for (i = 0; i < nsections; i++) {
- char sectionname[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
- strncpy(sectionname, ptr, 8);
- // log_msg("COFF: Parsing section %s\n",sectionname);
-
- sectionlist[i] = malloc(strlen(sectionname) + 1);
-
- if (sectionlist[i] == NULL) {
- log_msg("Allocating storage for %s failed\n", sectionname);
- goto bail;
- }
- strcpy(sectionlist[i], sectionname);
-
- // check if it's .rdata and is not a COMDAT section.
- if (!strcmp(sectionname, ".rdata") &&
- (get_le32(ptr + 36) & 0x1000) == 0) {
- sectionrawdata_ptr = get_le32(ptr + 20);
- }
-
- ptr += 40;
- }
-
- // log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
-  // log_msg("COFF: raw data pointer offset for section .rdata is %u\n", sectionrawdata_ptr);
-
- /* The compiler puts the data with non-zero offset in .rdata section, but puts the data with
-     zero offset in .bss section. So, if the data is in the .bss section, set offset=0.
- Note from Wiki: In an object module compiled from C, the bss section contains
- the local variables (but not functions) that were declared with the static keyword,
- except for those with non-zero initial values. (In C, static variables are initialized
- to zero by default.) It also contains the non-local (both extern and static) variables
- that are also initialized to zero (either explicitly or by default).
- */
- // move to symbol table
- /* COFF symbol table:
- offset field
- 0 Name(*)
- 8 Value
- 12 SectionNumber
- 14 Type
- 16 StorageClass
- 17 NumberOfAuxSymbols
- */
- ptr = buf + symtab_ptr;
-
- for (i = 0; i < symtab_sz; i++) {
- int16_t section = get_le16(ptr + 12); // section number
-
- if (section > 0 && ptr[16] == 2) {
- // if(section > 0 && ptr[16] == 3 && get_le32(ptr+8)) {
-
- if (get_le32(ptr)) {
- char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
- strncpy(name, ptr, 8);
- // log_msg("COFF: Parsing symbol %s\n",name);
- /* The 64bit Windows compiler doesn't prefix with an _.
- * Check what's there, and bump if necessary
- */
- if (name[0] == '_')
- printf("%-40s EQU ", name + 1);
- else
- printf("%-40s EQU ", name);
- } else {
- // log_msg("COFF: Parsing symbol %s\n",
- // buf + strtab_ptr + get_le32(ptr+4));
- if ((buf + strtab_ptr + get_le32(ptr + 4))[0] == '_')
- printf("%-40s EQU ",
- buf + strtab_ptr + get_le32(ptr + 4) + 1);
- else
- printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
- }
-
- if (!(strcmp(sectionlist[section - 1], ".bss"))) {
- symoffset = 0;
- } else {
- symoffset = get_le32(buf + sectionrawdata_ptr + get_le32(ptr + 8));
- }
-
- // log_msg(" Section: %d\n",section);
- // log_msg(" Class: %d\n",ptr[16]);
- // log_msg(" Address: %u\n",get_le32(ptr+8));
- // log_msg(" Offset: %u\n", symoffset);
-
- printf("%5d\n", symoffset);
- }
-
- ptr += 18;
- }
-
- printf(" END\n");
-
- for (i = 0; i < nsections; i++) {
- free(sectionlist[i]);
- }
-
- free(sectionlist);
-
- return 0;
-bail:
-
- for (i = 0; i < nsections; i++) {
- free(sectionlist[i]);
- }
-
- free(sectionlist);
-
- return 1;
-}
-#endif /* defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__) */
-
-int main(int argc, char **argv) {
- output_fmt_t mode = OUTPUT_FMT_PLAIN;
- const char *f;
- uint8_t *file_buf;
- int res;
- FILE *fp;
- long int file_size;
-
- if (argc < 2 || argc > 3) {
- fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
- fprintf(stderr, " <obj file>\tobject file to parse\n");
- fprintf(stderr, "Output Formats:\n");
- fprintf(stderr, " gas - compatible with GNU assembler\n");
- fprintf(stderr, " rvds - compatible with armasm\n");
- fprintf(stderr, " cheader - c/c++ header file\n");
- goto bail;
- }
-
- f = argv[2];
-
- if (!strcmp(argv[1], "rvds"))
- mode = OUTPUT_FMT_RVDS;
- else if (!strcmp(argv[1], "gas"))
- mode = OUTPUT_FMT_GAS;
- else if (!strcmp(argv[1], "cheader"))
- mode = OUTPUT_FMT_C_HEADER;
- else
- f = argv[1];
-
- fp = fopen(f, "rb");
-
- if (!fp) {
- perror("Unable to open file");
- goto bail;
- }
-
- if (fseek(fp, 0, SEEK_END)) {
- perror("stat");
- goto bail;
- }
-
- file_size = ftell(fp);
- file_buf = malloc(file_size);
-
- if (!file_buf) {
- perror("malloc");
- goto bail;
- }
-
- rewind(fp);
-
- if (fread(file_buf, sizeof(char), file_size, fp) != file_size) {
- perror("read");
- goto bail;
- }
-
- if (fclose(fp)) {
- perror("close");
- goto bail;
- }
-
-#if defined(__GNUC__) && __GNUC__
-#if defined(__MACH__)
- res = parse_macho(file_buf, file_size, mode);
-#elif defined(__ELF__)
- res = parse_elf(file_buf, file_size, mode);
-#endif
-#endif
-#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
- res = parse_coff(file_buf, file_size);
-#endif
-
- free(file_buf);
-
- if (!res)
- return EXIT_SUCCESS;
-
-bail:
- return EXIT_FAILURE;
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/rtcd.pl b/chromium/third_party/libvpx/source/libvpx/build/make/rtcd.pl
index 0872414cbb5..6753ee776a2 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/rtcd.pl
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/rtcd.pl
@@ -376,17 +376,18 @@ if ($opts{arch} eq 'x86') {
@ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
last;
}
+ if (/HAVE_MSA=yes/) {
+ @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
+ last;
+ }
}
close CONFIG_FILE;
mips;
-} elsif ($opts{arch} eq 'armv5te') {
- @ALL_ARCHS = filter(qw/edsp/);
- arm;
} elsif ($opts{arch} eq 'armv6') {
- @ALL_ARCHS = filter(qw/edsp media/);
+ @ALL_ARCHS = filter(qw/media/);
arm;
-} elsif ($opts{arch} eq 'armv7') {
- @ALL_ARCHS = filter(qw/edsp media neon_asm neon/);
+} elsif ($opts{arch} =~ /armv7\w?/) {
+ @ALL_ARCHS = filter(qw/media neon_asm neon/);
@REQUIRES = filter(keys %required ? keys %required : qw/media/);
&require(@REQUIRES);
arm;
diff --git a/chromium/third_party/libvpx/source/libvpx/build/x86-msvs/obj_int_extract.bat b/chromium/third_party/libvpx/source/libvpx/build/x86-msvs/obj_int_extract.bat
deleted file mode 100644
index dfa3b908390..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/build/x86-msvs/obj_int_extract.bat
+++ /dev/null
@@ -1,15 +0,0 @@
-REM Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-REM
-REM Use of this source code is governed by a BSD-style license
-REM that can be found in the LICENSE file in the root of the source
-REM tree. An additional intellectual property rights grant can be found
-REM in the file PATENTS. All contributing project authors may
-REM be found in the AUTHORS file in the root of the source tree.
-echo on
-
-REM Arguments:
-REM %1 - Relative path to the directory containing the vp8 source directory.
-REM %2 - Path to obj_int_extract.exe.
-cl /I. /I%1 /nologo /c "%~1/vp8/encoder/vp8_asm_enc_offsets.c"
-%2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
-
diff --git a/chromium/third_party/libvpx/source/libvpx/configure b/chromium/third_party/libvpx/source/libvpx/configure
index 3ed976c83bd..98542855a3e 100755
--- a/chromium/third_party/libvpx/source/libvpx/configure
+++ b/chromium/third_party/libvpx/source/libvpx/configure
@@ -26,19 +26,18 @@ Advanced options:
${toggle_unit_tests} unit tests
${toggle_decode_perf_tests} build decoder perf tests with unit tests
${toggle_encode_perf_tests} build encoder perf tests with unit tests
+ --cpu=CPU tune for the specified CPU (ARM: cortex-a8, X86: sse3)
--libc=PATH path to alternate libc
--size-limit=WxH max size to allow in the decoder
--as={yasm|nasm|auto} use specified assembler [auto, yasm preferred]
--sdk-path=PATH path to root of sdk (android builds only)
- ${toggle_fast_unaligned} don't use unaligned accesses, even when
- supported by hardware [auto]
${toggle_codec_srcs} in/exclude codec library source code
${toggle_debug_libs} in/exclude debug version of libraries
${toggle_static_msvcrt} use static MSVCRT (VS builds only)
+ ${toggle_vp9_highbitdepth} use VP9 high bit depth (10/12) profiles
${toggle_vp8} VP8 codec support
${toggle_vp9} VP9 codec support
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
- ${toggle_mem_tracker} track memory usage
${toggle_postproc} postprocessing
${toggle_vp9_postproc} vp9 specific postprocessing
${toggle_multithread} multithreaded encoding and decoding
@@ -56,6 +55,8 @@ Advanced options:
${toggle_postproc_visualizer} macro block / block level visualizers
${toggle_multi_res_encoding} enable multiple-resolution encoding
${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser
+ ${toggle_vp9_temporal_denoising}
+ enable vp9 temporal denoising
${toggle_webm_io} enable input from and output to WebM container
${toggle_libyuv} enable libyuv
@@ -93,10 +94,6 @@ EOF
# all_platforms is a list of all supported target platforms. Maintain
# alphabetically by architecture, generic-gnu last.
-all_platforms="${all_platforms} armv5te-android-gcc"
-all_platforms="${all_platforms} armv5te-linux-rvct"
-all_platforms="${all_platforms} armv5te-linux-gcc"
-all_platforms="${all_platforms} armv5te-none-rvct"
all_platforms="${all_platforms} armv6-darwin-gcc"
all_platforms="${all_platforms} armv6-linux-rvct"
all_platforms="${all_platforms} armv6-linux-gcc"
@@ -112,12 +109,6 @@ all_platforms="${all_platforms} armv7-win32-vs12"
all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} mips64-linux-gcc"
-all_platforms="${all_platforms} ppc32-darwin8-gcc"
-all_platforms="${all_platforms} ppc32-darwin9-gcc"
-all_platforms="${all_platforms} ppc32-linux-gcc"
-all_platforms="${all_platforms} ppc64-darwin8-gcc"
-all_platforms="${all_platforms} ppc64-darwin9-gcc"
-all_platforms="${all_platforms} ppc64-linux-gcc"
all_platforms="${all_platforms} sparc-solaris-gcc"
all_platforms="${all_platforms} x86-android-gcc"
all_platforms="${all_platforms} x86-darwin8-gcc"
@@ -128,6 +119,7 @@ all_platforms="${all_platforms} x86-darwin10-gcc"
all_platforms="${all_platforms} x86-darwin11-gcc"
all_platforms="${all_platforms} x86-darwin12-gcc"
all_platforms="${all_platforms} x86-darwin13-gcc"
+all_platforms="${all_platforms} x86-darwin14-gcc"
all_platforms="${all_platforms} x86-iphonesimulator-gcc"
all_platforms="${all_platforms} x86-linux-gcc"
all_platforms="${all_platforms} x86-linux-icc"
@@ -145,6 +137,7 @@ all_platforms="${all_platforms} x86_64-darwin10-gcc"
all_platforms="${all_platforms} x86_64-darwin11-gcc"
all_platforms="${all_platforms} x86_64-darwin12-gcc"
all_platforms="${all_platforms} x86_64-darwin13-gcc"
+all_platforms="${all_platforms} x86_64-darwin14-gcc"
all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc"
@@ -161,6 +154,7 @@ all_platforms="${all_platforms} universal-darwin10-gcc"
all_platforms="${all_platforms} universal-darwin11-gcc"
all_platforms="${all_platforms} universal-darwin12-gcc"
all_platforms="${all_platforms} universal-darwin13-gcc"
+all_platforms="${all_platforms} universal-darwin14-gcc"
all_platforms="${all_platforms} generic-gnu"
# all_targets is a list of all targets that can be configured
@@ -206,7 +200,7 @@ enable_feature install_libs
enable_feature static
enable_feature optimizations
-enable_feature fast_unaligned #allow unaligned accesses, if supported by hw
+enable_feature dependency_tracking
enable_feature spatial_resampling
enable_feature multithread
enable_feature os_support
@@ -243,8 +237,6 @@ ARCH_LIST="
mips
x86
x86_64
- ppc32
- ppc64
"
ARCH_EXT_LIST="
edsp
@@ -254,7 +246,7 @@ ARCH_EXT_LIST="
mips32
dspr2
-
+ msa
mips64
mmx
@@ -265,8 +257,6 @@ ARCH_EXT_LIST="
sse4_1
avx
avx2
-
- altivec
"
HAVE_LIST="
${ARCH_EXT_LIST}
@@ -279,11 +269,11 @@ HAVE_LIST="
"
EXPERIMENT_LIST="
spatial_svc
- vp9_temporal_denoising
fp_mb_stats
emulate_hardware
"
CONFIG_LIST="
+ dependency_tracking
external_build
install_docs
install_bins
@@ -301,10 +291,6 @@ CONFIG_LIST="
codec_srcs
debug_libs
- fast_unaligned
- mem_manager
- mem_tracker
- mem_checks
dequant_tokens
dc_recon
@@ -334,6 +320,7 @@ CONFIG_LIST="
encode_perf_tests
multi_res_encoding
temporal_denoising
+ vp9_temporal_denoising
coefficient_range_checking
vp9_highbitdepth
experimental
@@ -341,6 +328,7 @@ CONFIG_LIST="
${EXPERIMENT_LIST}
"
CMDLINE_SELECT="
+ dependency_tracking
external_build
extra_warnings
werror
@@ -364,7 +352,6 @@ CMDLINE_SELECT="
libc
as
size_limit
- fast_unaligned
codec_srcs
debug_libs
@@ -377,7 +364,6 @@ CMDLINE_SELECT="
${CODECS}
${CODEC_FAMILIES}
static_msvcrt
- mem_tracker
spatial_resampling
realtime_only
onthefly_bitpacking
@@ -393,6 +379,7 @@ CMDLINE_SELECT="
encode_perf_tests
multi_res_encoding
temporal_denoising
+ vp9_temporal_denoising
coefficient_range_checking
vp9_highbitdepth
experimental
@@ -451,8 +438,6 @@ process_targets() {
enabled child || write_common_config_banner
enabled universal || write_common_target_config_h ${BUILD_PFX}vpx_config.h
- # TODO: add host tools target (obj_int_extract, etc)
-
# For fat binaries, call configure recursively to configure for each
# binary architecture to be included.
if enabled universal; then
@@ -616,12 +601,6 @@ process_toolchain() {
universal-darwin*)
darwin_ver=${tgt_os##darwin}
- # Snow Leopard (10.6/darwin10) dropped support for PPC
- # Include PPC support for all prior versions
- if [ $darwin_ver -lt 10 ]; then
- fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc"
- fi
-
# Tiger (10.4/darwin8) brought support for x86
if [ $darwin_ver -ge 8 ]; then
fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}"
@@ -722,7 +701,7 @@ process_toolchain() {
esac
# Other toolchain specific defaults
- case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
+ case $toolchain in x86*|universal*) soft_enable postproc;; esac
if enabled postproc_visualizer; then
enabled postproc || die "postproc_visualizer requires postproc to be enabled"
diff --git a/chromium/third_party/libvpx/source/libvpx/examples.mk b/chromium/third_party/libvpx/source/libvpx/examples.mk
index fd67a44dfe9..4ff1de4eeae 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples.mk
+++ b/chromium/third_party/libvpx/source/libvpx/examples.mk
@@ -338,6 +338,7 @@ $(foreach proj,$(call enabled,PROJECTS),\
#
%.dox: %.c
@echo " [DOXY] $@"
+ @mkdir -p $(dir $@)
@echo "/*!\page example_$(@F:.dox=) $(@F:.dox=)" > $@
@echo " \includelineno $(<F)" >> $@
@echo "*/" >> $@
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c b/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c
index fbc0f4a6f19..a3843bed336 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c
@@ -36,9 +36,9 @@
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
-#include "./md5_utils.h"
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../md5_utils.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
#include "./vpx_config.h"
static void get_image_md5(const vpx_image_t *img, unsigned char digest[16]) {
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c b/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c
index 9423e38ffac..36f7d80e127 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c
@@ -59,8 +59,8 @@
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
#include "./vpx_config.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/postproc.c b/chromium/third_party/libvpx/source/libvpx/examples/postproc.c
index c74347c4c71..e34426a6194 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/postproc.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/postproc.c
@@ -46,8 +46,8 @@
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
#include "./vpx_config.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c b/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c
index b068f552405..f8c35255fa2 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c
@@ -15,15 +15,22 @@
#include <stdlib.h>
#include <string.h>
-#include "./vp9/encoder/vp9_resize.h"
+#include "../vp9/encoder/vp9_resize.h"
-static void usage(char *progname) {
+static const char *exec_name = NULL;
+
+static void usage() {
printf("Usage:\n");
printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
- progname);
+ exec_name);
printf("<output_yuv> [<frames>]\n");
}
+void usage_exit() {
+ usage();
+ exit(EXIT_FAILURE);
+}
+
static int parse_dim(char *v, int *width, int *height) {
char *x = strchr(v, 'x');
if (x == NULL)
@@ -47,9 +54,11 @@ int main(int argc, char *argv[]) {
int f, frames;
int width, height, target_width, target_height;
+ exec_name = argv[0];
+
if (argc < 5) {
printf("Incorrect parameters:\n");
- usage(argv[0]);
+ usage();
return 1;
}
@@ -57,25 +66,25 @@ int main(int argc, char *argv[]) {
fout = argv[4];
if (!parse_dim(argv[2], &width, &height)) {
printf("Incorrect parameters: %s\n", argv[2]);
- usage(argv[0]);
+ usage();
return 1;
}
if (!parse_dim(argv[3], &target_width, &target_height)) {
printf("Incorrect parameters: %s\n", argv[3]);
- usage(argv[0]);
+ usage();
return 1;
}
fpin = fopen(fin, "rb");
if (fpin == NULL) {
printf("Can't open file %s to read\n", fin);
- usage(argv[0]);
+ usage();
return 1;
}
fpout = fopen(fout, "wb");
if (fpout == NULL) {
printf("Can't open file %s to write\n", fout);
- usage(argv[0]);
+ usage();
return 1;
}
if (argc >= 6)
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c b/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c
index 851adc42ea1..5555baac22e 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c
@@ -50,8 +50,8 @@
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c b/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c
index c58b014f76a..08a21668542 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c
@@ -82,8 +82,8 @@
#include "vpx/vpx_decoder.h"
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
#include "./vpx_config.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c
index f20c246daf2..e805c258747 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c
@@ -101,8 +101,8 @@
#include "vpx/vpx_encoder.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c
index 653ae948224..0ec83ddccdf 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c
@@ -53,8 +53,8 @@
#include "vpx/vpx_encoder.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
index 9f50dc7cf52..e623567b8fe 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -8,292 +8,730 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+/*
+ * This is an example demonstrating multi-resolution encoding in VP8.
+ * High-resolution input video is down-sampled to lower-resolutions. The
+ * encoder then encodes the video and outputs multiple bitstreams with
+ * different resolutions.
+ *
+ * This test also allows for setting temporal layers for each spatial layer.
+ * A different number of temporal layers may be used per spatial stream.
+ * Currently up to 3 temporal layers per spatial stream (encoder) are supported
+ * in this test.
+ */
-// This is an example demonstrating multi-resolution encoding in VP8.
-// High-resolution input video is down-sampled to lower-resolutions. The
-// encoder then encodes the video and outputs multiple bitstreams with
-// different resolutions.
-//
-// Configure with --enable-multi-res-encoding flag to enable this example.
+#include "./vpx_config.h"
#include <stdio.h>
#include <stdlib.h>
+#include <stdarg.h>
#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include <sys/time.h>
+#if USE_POSIX_MMAP
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+#include "vpx_ports/vpx_timer.h"
+#define VPX_CODEC_DISABLE_COMPAT 1
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
+#include "vpx_ports/mem_ops.h"
+#include "./tools_common.h"
+#define interface (vpx_codec_vp8_cx())
+#define fourcc 0x30385056
+
+void usage_exit() {
+ exit(EXIT_FAILURE);
+}
+
+/*
+ * The input video frame is downsampled several times to generate a multi-level
+ * hierarchical structure. NUM_ENCODERS is defined as the number of encoding
+ * levels required. For example, if the size of input video is 1280x720,
+ * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3
+ * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and
+ * 320x180(level 2) respectively.
+ */
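A quick check of the arithmetic in the comment above, as a sketch assuming a fixed down-sampling factor of 2 per level (the example itself drives this from a table of rational down-sampling factors):

    #include <stdio.h>

    int main(void) {
      int w = 1280, h = 720, level;
      for (level = 0; level < 3; ++level) {
        /* Prints 1280x720, 640x360, 320x180 for levels 0..2. */
        printf("level %d: %dx%d\n", level, w, h);
        w /= 2;
        h /= 2;
      }
      return 0;
    }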
+
+/* Number of encoders (spatial resolutions) used in this test. */
+#define NUM_ENCODERS 3
+
+/* Maximum number of temporal layers allowed for this test. */
+#define MAX_NUM_TEMPORAL_LAYERS 3
+/* This example uses the scaler function in libyuv. */
#include "third_party/libyuv/include/libyuv/basic_types.h"
#include "third_party/libyuv/include/libyuv/scale.h"
#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "vpx/vpx_encoder.h"
-#include "vpx/vp8cx.h"
+int (*read_frame_p)(FILE *f, vpx_image_t *img);
-#include "./tools_common.h"
-#include "./video_writer.h"
+static int read_frame(FILE *f, vpx_image_t *img) {
+ size_t nbytes, to_read;
+ int res = 1;
-// The input video frame is downsampled several times to generate a
-// multi-level hierarchical structure. kNumEncoders is defined as the number
-// of encoding levels required. For example, if the size of input video is
-// 1280x720, kNumEncoders is 3, and down-sampling factor is 2, the encoder
-// outputs 3 bitstreams with resolution of 1280x720(level 0),
-// 640x360(level 1), and 320x180(level 2) respectively.
-#define kNumEncoders 3
+ to_read = img->w*img->h*3/2;
+ nbytes = fread(img->planes[0], 1, to_read, f);
+ if(nbytes != to_read) {
+ res = 0;
+ if(nbytes > 0)
+ printf("Warning: Read partial frame. Check your width & height!\n");
+ }
+ return res;
+}
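The to_read computation above assumes 4:2:0 chroma subsampling, where the two chroma planes are each a quarter of the luma size. A worked check of the arithmetic:

    /* I420/YV12 (4:2:0) frame size: w*h luma bytes plus two chroma
     * planes of (w/2)*(h/2) bytes each:
     *   w*h + 2 * (w*h / 4) = w*h * 3/2
     * e.g. 1280 * 720 * 3 / 2 = 1382400 bytes per frame. */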
-static const char *exec_name;
+static int read_frame_by_row(FILE *f, vpx_image_t *img) {
+ size_t nbytes, to_read;
+ int res = 1;
+ int plane;
-void usage_exit() {
- fprintf(stderr,
- "Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n",
- exec_name);
- exit(EXIT_FAILURE);
+ for (plane = 0; plane < 3; plane++)
+ {
+ unsigned char *ptr;
+ int w = (plane ? (1 + img->d_w) / 2 : img->d_w);
+ int h = (plane ? (1 + img->d_h) / 2 : img->d_h);
+ int r;
+
+ /* Determine the correct plane based on the image format. The for-loop
+ * always counts in Y,U,V order, but this may not match the order of
+ * the data on disk.
+ */
+ switch (plane)
+ {
+ case 1:
+ ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12? VPX_PLANE_V : VPX_PLANE_U];
+ break;
+ case 2:
+ ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12?VPX_PLANE_U : VPX_PLANE_V];
+ break;
+ default:
+ ptr = img->planes[plane];
+ }
+
+ for (r = 0; r < h; r++)
+ {
+ to_read = w;
+
+ nbytes = fread(ptr, 1, to_read, f);
+ if(nbytes != to_read) {
+ res = 0;
+ if(nbytes > 0)
+ printf("Warning: Read partial frame. Check your width & height!\n");
+ break;
+ }
+
+ ptr += img->stride[plane];
+ }
+ if (!res)
+ break;
+ }
+
+ return res;
}
-int main(int argc, char *argv[]) {
- int frame_cnt = 0;
- FILE *infile = NULL;
- VpxVideoWriter *writers[kNumEncoders];
- vpx_codec_ctx_t codec[kNumEncoders];
- vpx_codec_enc_cfg_t cfg[kNumEncoders];
- vpx_image_t raw[kNumEncoders];
- const VpxInterface *const encoder = get_vpx_encoder_by_name("vp8");
- // Currently, only realtime mode is supported in multi-resolution encoding.
- const int arg_deadline = VPX_DL_REALTIME;
- int i;
- int width = 0;
- int height = 0;
- int frame_avail = 0;
- int got_data = 0;
-
- // Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
- // don't need to know PSNR, which will skip PSNR calculation and save
- // encoding time.
- int show_psnr = 0;
- uint64_t psnr_sse_total[kNumEncoders] = {0};
- uint64_t psnr_samples_total[kNumEncoders] = {0};
- double psnr_totals[kNumEncoders][4] = {{0, 0}};
- int psnr_count[kNumEncoders] = {0};
-
- // Set the required target bitrates for each resolution level.
- // If target bitrate for highest-resolution level is set to 0,
- // (i.e. target_bitrate[0]=0), we skip encoding at that level.
- unsigned int target_bitrate[kNumEncoders] = {1000, 500, 100};
-
- // Enter the frame rate of the input video.
- const int framerate = 30;
- // Set down-sampling factor for each resolution level.
- // dsf[0] controls down sampling from level 0 to level 1;
- // dsf[1] controls down sampling from level 1 to level 2;
- // dsf[2] is not used.
- vpx_rational_t dsf[kNumEncoders] = {{2, 1}, {2, 1}, {1, 1}};
-
- exec_name = argv[0];
-
- if (!encoder)
- die("Unsupported codec.");
-
- // exe_name, input width, input height, input file,
- // output file 1, output file 2, output file 3, psnr on/off
- if (argc != (5 + kNumEncoders))
- die("Invalid number of input options.");
-
- printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
-
- width = strtol(argv[1], NULL, 0);
- height = strtol(argv[2], NULL, 0);
-
- if (width < 16 || width % 2 || height < 16 || height % 2)
- die("Invalid resolution: %ldx%ld", width, height);
-
- // Open input video file for encoding
- if (!(infile = fopen(argv[3], "rb")))
- die("Failed to open %s for reading", argv[3]);
-
- show_psnr = strtol(argv[kNumEncoders + 4], NULL, 0);
-
- // Populate default encoder configuration
- for (i = 0; i < kNumEncoders; ++i) {
- vpx_codec_err_t res =
- vpx_codec_enc_config_default(encoder->codec_interface(), &cfg[i], 0);
- if (res != VPX_CODEC_OK) {
- printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
- return EXIT_FAILURE;
+static void write_ivf_file_header(FILE *outfile,
+ const vpx_codec_enc_cfg_t *cfg,
+ int frame_cnt) {
+ char header[32];
+
+ if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
+ return;
+ header[0] = 'D';
+ header[1] = 'K';
+ header[2] = 'I';
+ header[3] = 'F';
+ mem_put_le16(header+4, 0); /* version */
+ mem_put_le16(header+6, 32); /* headersize */
+ mem_put_le32(header+8, fourcc); /* fourcc */
+ mem_put_le16(header+12, cfg->g_w); /* width */
+ mem_put_le16(header+14, cfg->g_h); /* height */
+ mem_put_le32(header+16, cfg->g_timebase.den); /* rate */
+ mem_put_le32(header+20, cfg->g_timebase.num); /* scale */
+ mem_put_le32(header+24, frame_cnt); /* length */
+ mem_put_le32(header+28, 0); /* unused */
+
+ (void) fwrite(header, 1, 32, outfile);
+}
+
+static void write_ivf_frame_header(FILE *outfile,
+ const vpx_codec_cx_pkt_t *pkt)
+{
+ char header[12];
+ vpx_codec_pts_t pts;
+
+ if(pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+ return;
+
+ pts = pkt->data.frame.pts;
+ mem_put_le32(header, pkt->data.frame.sz);
+ mem_put_le32(header+4, pts&0xFFFFFFFF);
+ mem_put_le32(header+8, pts >> 32);
+
+ (void) fwrite(header, 1, 12, outfile);
+}
+
+/* Temporal scaling parameters */
+/* This sets all the temporal layer parameters given |num_temporal_layers|,
+ * including the target bit allocation across temporal layers. Bit allocation
+ * parameters will be passed in as user parameters in another version.
+ */
+static void set_temporal_layer_pattern(int num_temporal_layers,
+ vpx_codec_enc_cfg_t *cfg,
+ int bitrate,
+ int *layer_flags)
+{
+ assert(num_temporal_layers <= MAX_NUM_TEMPORAL_LAYERS);
+ switch (num_temporal_layers)
+ {
+ case 1:
+ {
+ /* 1-layer */
+ cfg->ts_number_layers = 1;
+ cfg->ts_periodicity = 1;
+ cfg->ts_rate_decimator[0] = 1;
+ cfg->ts_layer_id[0] = 0;
+ cfg->ts_target_bitrate[0] = bitrate;
+
+ // Update L only.
+ layer_flags[0] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+ break;
+ }
+
+ case 2:
+ {
+ /* 2-layers, with sync point at first frame of layer 1. */
+ cfg->ts_number_layers = 2;
+ cfg->ts_periodicity = 2;
+ cfg->ts_rate_decimator[0] = 2;
+ cfg->ts_rate_decimator[1] = 1;
+ cfg->ts_layer_id[0] = 0;
+ cfg->ts_layer_id[1] = 1;
+ // Use 60/40 bit allocation as example.
+ cfg->ts_target_bitrate[0] = 0.6f * bitrate;
+ cfg->ts_target_bitrate[1] = bitrate;
+
+ /* 0=L, 1=GF */
+ // ARF is used as predictor for all frames, and is only updated on
+ // key frame. Sync point every 8 frames.
+
+ // Layer 0: predict from L and ARF, update L and G.
+ layer_flags[0] = VP8_EFLAG_NO_REF_GF |
+ VP8_EFLAG_NO_UPD_ARF;
+
+ // Layer 1: sync point: predict from L and ARF, and update G.
+ layer_flags[1] = VP8_EFLAG_NO_REF_GF |
+ VP8_EFLAG_NO_UPD_LAST |
+ VP8_EFLAG_NO_UPD_ARF;
+
+ // Layer 0, predict from L and ARF, update L.
+ layer_flags[2] = VP8_EFLAG_NO_REF_GF |
+ VP8_EFLAG_NO_UPD_GF |
+ VP8_EFLAG_NO_UPD_ARF;
+
+ // Layer 1: predict from L, G and ARF, and update G.
+ layer_flags[3] = VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_UPD_LAST |
+ VP8_EFLAG_NO_UPD_ENTROPY;
+
+ // Layer 0
+ layer_flags[4] = layer_flags[2];
+
+ // Layer 1
+ layer_flags[5] = layer_flags[3];
+
+ // Layer 0
+ layer_flags[6] = layer_flags[4];
+
+ // Layer 1
+ layer_flags[7] = layer_flags[5];
+ break;
}
- }
-
- // Update the default configuration according to needs of the application.
- // Highest-resolution encoder settings
- cfg[0].g_w = width;
- cfg[0].g_h = height;
- cfg[0].g_threads = 1;
- cfg[0].rc_dropframe_thresh = 30;
- cfg[0].rc_end_usage = VPX_CBR;
- cfg[0].rc_resize_allowed = 0;
- cfg[0].rc_min_quantizer = 4;
- cfg[0].rc_max_quantizer = 56;
- cfg[0].rc_undershoot_pct = 98;
- cfg[0].rc_overshoot_pct = 100;
- cfg[0].rc_buf_initial_sz = 500;
- cfg[0].rc_buf_optimal_sz = 600;
- cfg[0].rc_buf_sz = 1000;
- cfg[0].g_error_resilient = 1;
- cfg[0].g_lag_in_frames = 0;
- cfg[0].kf_mode = VPX_KF_AUTO; // VPX_KF_DISABLED
- cfg[0].kf_min_dist = 3000;
- cfg[0].kf_max_dist = 3000;
- cfg[0].rc_target_bitrate = target_bitrate[0];
- cfg[0].g_timebase.num = 1;
- cfg[0].g_timebase.den = framerate;
-
- // Other-resolution encoder settings
- for (i = 1; i < kNumEncoders; ++i) {
- cfg[i] = cfg[0];
- cfg[i].g_threads = 1;
- cfg[i].rc_target_bitrate = target_bitrate[i];
-
- // Note: Width & height of other-resolution encoders are calculated
- // from the highest-resolution encoder's size and the corresponding
- // down_sampling_factor.
+
+ case 3:
+ default:
{
- unsigned int iw = cfg[i - 1].g_w * dsf[i - 1].den + dsf[i - 1].num - 1;
- unsigned int ih = cfg[i - 1].g_h * dsf[i - 1].den + dsf[i - 1].num - 1;
- cfg[i].g_w = iw / dsf[i - 1].num;
- cfg[i].g_h = ih / dsf[i - 1].num;
+ // 3-layers structure where ARF is used as predictor for all frames,
+ // and is only updated on key frame.
+ // Sync points for layer 1 and 2 every 8 frames.
+ cfg->ts_number_layers = 3;
+ cfg->ts_periodicity = 4;
+ cfg->ts_rate_decimator[0] = 4;
+ cfg->ts_rate_decimator[1] = 2;
+ cfg->ts_rate_decimator[2] = 1;
+ cfg->ts_layer_id[0] = 0;
+ cfg->ts_layer_id[1] = 2;
+ cfg->ts_layer_id[2] = 1;
+ cfg->ts_layer_id[3] = 2;
+ // Use 40/20/40 bit allocation as example.
+ cfg->ts_target_bitrate[0] = 0.4f * bitrate;
+ cfg->ts_target_bitrate[1] = 0.6f * bitrate;
+ cfg->ts_target_bitrate[2] = bitrate;
+
+ /* 0=L, 1=GF, 2=ARF */
+
+ // Layer 0: predict from L and ARF; update L and G.
+ layer_flags[0] = VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_REF_GF;
+
+ // Layer 2: sync point: predict from L and ARF; update none.
+ layer_flags[1] = VP8_EFLAG_NO_REF_GF |
+ VP8_EFLAG_NO_UPD_GF |
+ VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_UPD_LAST |
+ VP8_EFLAG_NO_UPD_ENTROPY;
+
+ // Layer 1: sync point: predict from L and ARF; update G.
+ layer_flags[2] = VP8_EFLAG_NO_REF_GF |
+ VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_UPD_LAST;
+
+ // Layer 2: predict from L, G, ARF; update none.
+ layer_flags[3] = VP8_EFLAG_NO_UPD_GF |
+ VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_UPD_LAST |
+ VP8_EFLAG_NO_UPD_ENTROPY;
+
+ // Layer 0: predict from L and ARF; update L.
+ layer_flags[4] = VP8_EFLAG_NO_UPD_GF |
+ VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_REF_GF;
+
+ // Layer 2: predict from L, G, ARF; update none.
+ layer_flags[5] = layer_flags[3];
+
+ // Layer 1: predict from L, G, ARF; update G.
+ layer_flags[6] = VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_UPD_LAST;
+
+ // Layer 2: predict from L, G, ARF; update none.
+ layer_flags[7] = layer_flags[3];
+ break;
+ }
}
+}
+
+/* The periodicity of the pattern given the number of temporal layers. */
+static int periodicity_to_num_layers[MAX_NUM_TEMPORAL_LAYERS] = {1, 8, 8};
+
+int main(int argc, char **argv)
+{
+ FILE *infile, *outfile[NUM_ENCODERS];
+ FILE *downsampled_input[NUM_ENCODERS - 1];
+ char filename[50];
+ vpx_codec_ctx_t codec[NUM_ENCODERS];
+ vpx_codec_enc_cfg_t cfg[NUM_ENCODERS];
+ int frame_cnt = 0;
+ vpx_image_t raw[NUM_ENCODERS];
+ vpx_codec_err_t res[NUM_ENCODERS];
+
+ int i;
+ long width;
+ long height;
+ int length_frame;
+ int frame_avail;
+ int got_data;
+ int flags = 0;
+ int layer_id = 0;
+
+ int layer_flags[VPX_TS_MAX_PERIODICITY * NUM_ENCODERS]
+ = {0};
+ int flag_periodicity;
+
+ /*Currently, only realtime mode is supported in multi-resolution encoding.*/
+ int arg_deadline = VPX_DL_REALTIME;
+
+ /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
+ don't need to know PSNR, which will skip PSNR calculation and save
+ encoding time. */
+ int show_psnr = 0;
+ int key_frame_insert = 0;
+ uint64_t psnr_sse_total[NUM_ENCODERS] = {0};
+ uint64_t psnr_samples_total[NUM_ENCODERS] = {0};
+ double psnr_totals[NUM_ENCODERS][4] = {{0,0}};
+ int psnr_count[NUM_ENCODERS] = {0};
+
+ double cx_time = 0;
+ struct timeval tv1, tv2, difftv;
+
+ /* Set the required target bitrates for each resolution level.
+ * If the target bitrate for the highest-resolution level is set to 0
+ * (i.e. target_bitrate[0] = 0), encoding at that level is skipped.
+ */
+ unsigned int target_bitrate[NUM_ENCODERS]={1000, 500, 100};
+
+ /* Enter the frame rate of the input video */
+ int framerate = 30;
+
+ /* Set down-sampling factor for each resolution level.
+ dsf[0] controls down sampling from level 0 to level 1;
+ dsf[1] controls down sampling from level 1 to level 2;
+ dsf[2] is not used. */
+ vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}};
+
+ /* Set the number of temporal layers for each encoder/resolution level,
+ * starting from the highest resolution down to the lowest. */
+ unsigned int num_temporal_layers[NUM_ENCODERS] = {3, 3, 3};
+
+ if(argc != (7 + 3 * NUM_ENCODERS))
+ die("Usage: %s <width> <height> <frame_rate> <infile> <outfile(s)> "
+ "<rate_encoder(s)> <temporal_layer(s)> <key_frame_insert> <output psnr?> \n",
+ argv[0]);
+
+ printf("Using %s\n",vpx_codec_iface_name(interface));
+
+ width = strtol(argv[1], NULL, 0);
+ height = strtol(argv[2], NULL, 0);
+ framerate = strtol(argv[3], NULL, 0);
+
+ if(width < 16 || width % 2 || height < 16 || height % 2)
+ die("Invalid resolution: %ldx%ld", width, height);
+
+ /* Open input video file for encoding */
+ if(!(infile = fopen(argv[4], "rb")))
+ die("Failed to open %s for reading", argv[4]);
+
+ /* Open output file for each encoder to output bitstreams */
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ if(!target_bitrate[i])
+ {
+ outfile[i] = NULL;
+ continue;
+ }
- // Make width & height to be multiplier of 2.
- if ((cfg[i].g_w) % 2)
- cfg[i].g_w++;
-
- if ((cfg[i].g_h) % 2)
- cfg[i].g_h++;
- }
-
- // Open output file for each encoder to output bitstreams
- for (i = 0; i < kNumEncoders; ++i) {
- VpxVideoInfo info = {
- encoder->fourcc,
- cfg[i].g_w,
- cfg[i].g_h,
- {cfg[i].g_timebase.num, cfg[i].g_timebase.den}
- };
-
- if (!(writers[i] = vpx_video_writer_open(argv[i+4], kContainerIVF, &info)))
- die("Failed to open %s for writing", argv[i+4]);
- }
-
- // Allocate image for each encoder
- for (i = 0; i < kNumEncoders; ++i)
- if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
- die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
-
- // Initialize multi-encoder
- if (vpx_codec_enc_init_multi(&codec[0], encoder->codec_interface(), &cfg[0],
- kNumEncoders,
- show_psnr ? VPX_CODEC_USE_PSNR : 0, &dsf[0]))
- die_codec(&codec[0], "Failed to initialize encoder");
-
- // The extra encoding configuration parameters can be set as follows.
- for (i = 0; i < kNumEncoders; i++) {
- // Set encoding speed
- if (vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, -6))
- die_codec(&codec[i], "Failed to set cpu_used");
-
- // Set static threshold.
- if (vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1))
- die_codec(&codec[i], "Failed to set static threshold");
-
- // Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING
- // Enable denoising for the highest-resolution encoder.
- if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, i == 0))
- die_codec(&codec[0], "Failed to set noise_sensitivity");
- }
-
- frame_avail = 1;
- got_data = 0;
-
- while (frame_avail || got_data) {
- vpx_codec_iter_t iter[kNumEncoders] = {NULL};
- const vpx_codec_cx_pkt_t *pkt[kNumEncoders];
-
- frame_avail = vpx_img_read(&raw[0], infile);
-
- if (frame_avail) {
- for (i = 1; i < kNumEncoders; ++i) {
- vpx_image_t *const prev = &raw[i - 1];
-
- // Scale the image down a number of times by downsampling factor
- // FilterMode 1 or 2 give better psnr than FilterMode 0.
- I420Scale(prev->planes[VPX_PLANE_Y], prev->stride[VPX_PLANE_Y],
- prev->planes[VPX_PLANE_U], prev->stride[VPX_PLANE_U],
- prev->planes[VPX_PLANE_V], prev->stride[VPX_PLANE_V],
- prev->d_w, prev->d_h,
- raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
- raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
- raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
- raw[i].d_w, raw[i].d_h, 1);
- }
+ if(!(outfile[i] = fopen(argv[i+5], "wb")))
+ die("Failed to open %s for writing", argv[i+4]);
}
- // Encode frame.
- if (vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
- frame_cnt, 1, 0, arg_deadline)) {
- die_codec(&codec[0], "Failed to encode frame");
+ // Bitrates per spatial layer: overwrite default rates above.
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0);
}
- for (i = kNumEncoders - 1; i >= 0; i--) {
- got_data = 0;
-
- while ((pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i]))) {
- got_data = 1;
- switch (pkt[i]->kind) {
- case VPX_CODEC_CX_FRAME_PKT:
- vpx_video_writer_write_frame(writers[i], pkt[i]->data.frame.buf,
- pkt[i]->data.frame.sz, frame_cnt - 1);
- break;
- case VPX_CODEC_PSNR_PKT:
- if (show_psnr) {
- int j;
- psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
- psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
- for (j = 0; j < 4; j++)
- psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
- psnr_count[i]++;
- }
- break;
- default:
- break;
+ // Temporal layers per spatial layers: overwrite default settings above.
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
+ if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3)
+ die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n",
+ num_temporal_layers);
+ }
+
+ /* Open file to write out each spatially downsampled input stream. */
+ for (i=0; i< NUM_ENCODERS - 1; i++)
+ {
+ // The highest resolution is encoder 0.
+ if (sprintf(filename,"ds%d.yuv",NUM_ENCODERS - i) < 0)
+ {
+ return EXIT_FAILURE;
+ }
+ downsampled_input[i] = fopen(filename,"wb");
+ }
+
+ key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0);
+
+ show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0);
+
+
+ /* Populate default encoder configuration */
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0);
+ if(res[i]) {
+ printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i]));
+ return EXIT_FAILURE;
}
- printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT &&
- (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
- fflush(stdout);
- }
}
- frame_cnt++;
- }
- printf("\n");
-
- fclose(infile);
-
- printf("Processed %d frames.\n", frame_cnt - 1);
- for (i = 0; i < kNumEncoders; ++i) {
- // Calculate PSNR and print it out
- if (show_psnr && psnr_count[i] > 0) {
- int j;
- double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
- psnr_sse_total[i]);
-
- fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
- fprintf(stderr, " %.3lf", ovpsnr);
- for (j = 0; j < 4; j++)
- fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
+
+ /*
+ * Update the default configuration according to needs of the application.
+ */
+ /* Highest-resolution encoder settings */
+ cfg[0].g_w = width;
+ cfg[0].g_h = height;
+ cfg[0].rc_dropframe_thresh = 0;
+ cfg[0].rc_end_usage = VPX_CBR;
+ cfg[0].rc_resize_allowed = 0;
+ cfg[0].rc_min_quantizer = 2;
+ cfg[0].rc_max_quantizer = 56;
+ cfg[0].rc_undershoot_pct = 100;
+ cfg[0].rc_overshoot_pct = 15;
+ cfg[0].rc_buf_initial_sz = 500;
+ cfg[0].rc_buf_optimal_sz = 600;
+ cfg[0].rc_buf_sz = 1000;
+ cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
+ cfg[0].g_lag_in_frames = 0;
+
+ /* Keyframe placement: kf_min_dist == kf_max_dist gives a fixed keyframe interval. */
+ /* Note: These 3 settings are copied to all levels. But, except the lowest
+ * resolution level, all other levels are set to VPX_KF_DISABLED internally.
+ */
+ cfg[0].kf_mode = VPX_KF_AUTO;
+ cfg[0].kf_min_dist = 3000;
+ cfg[0].kf_max_dist = 3000;
+
+ cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */
+ cfg[0].g_timebase.num = 1; /* Set fps */
+ cfg[0].g_timebase.den = framerate;
+
+ /* Other-resolution encoder settings */
+ for (i=1; i< NUM_ENCODERS; i++)
+ {
+ memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t));
+
+ cfg[i].rc_target_bitrate = target_bitrate[i];
+
+ /* Note: Width & height of other-resolution encoders are calculated
+ * from the highest-resolution encoder's size and the corresponding
+ * down_sampling_factor.
+ */
+ {
+ unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1;
+ unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1;
+ cfg[i].g_w = iw/dsf[i-1].num;
+ cfg[i].g_h = ih/dsf[i-1].num;
+ }
+
+ /* Round width & height up to multiples of 2. */
+ // TODO: Add support for odd frame sizes.
+ if (cfg[i].g_w % 2) cfg[i].g_w++;
+ if (cfg[i].g_h % 2) cfg[i].g_h++;
+ }
+
+
+ // Set the number of threads per encode/spatial layer.
+ // (1, 1, 1) means no encoder threading.
+ cfg[0].g_threads = 2;
+ cfg[1].g_threads = 1;
+ cfg[2].g_threads = 1;
+
+ /* Allocate image for each encoder */
+ for (i=0; i< NUM_ENCODERS; i++)
+ if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
+ die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
+
+ if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
+ read_frame_p = read_frame;
+ else
+ read_frame_p = read_frame_by_row;
+
+ for (i=0; i< NUM_ENCODERS; i++)
+ if(outfile[i])
+ write_ivf_file_header(outfile[i], &cfg[i], 0);
+
+ /* Temporal layers settings */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ set_temporal_layer_pattern(num_temporal_layers[i],
+ &cfg[i],
+ cfg[i].rc_target_bitrate,
+ &layer_flags[i * VPX_TS_MAX_PERIODICITY]);
+ }
+
+ /* Initialize multi-encoder */
+ if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS,
+ (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0]))
+ die_codec(&codec[0], "Failed to initialize encoder");
+
+ /* The extra encoding configuration parameters can be set as follows. */
+ /* Set encoding speed */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ int speed = -6;
+ /* Lower speed for the lowest resolution. */
+ if (i == NUM_ENCODERS - 1) speed = -4;
+ if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
+ die_codec(&codec[i], "Failed to set cpu_used");
}
- if (vpx_codec_destroy(&codec[i]))
- die_codec(&codec[i], "Failed to destroy codec");
+ /* Set static threshold = 1 for all encoders */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1))
+ die_codec(&codec[i], "Failed to set static threshold");
+ }
- vpx_img_free(&raw[i]);
- vpx_video_writer_close(writers[i]);
- }
- printf("\n");
+ /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
+ /* Enable denoising for the highest-resolution encoder. */
+ if(vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
+ die_codec(&codec[0], "Failed to set noise_sensitivity");
+ for ( i=1; i< NUM_ENCODERS; i++)
+ {
+ if(vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
+ die_codec(&codec[i], "Failed to set noise_sensitivity");
+ }
+
+ /* Set the number of token partitions */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ if(vpx_codec_control(&codec[i], VP8E_SET_TOKEN_PARTITIONS, 1))
+ die_codec(&codec[i], "Failed to set static threshold");
+ }
+
+ /* Set the max intra target bitrate */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ unsigned int max_intra_size_pct =
+ (int)(((double)cfg[0].rc_buf_optimal_sz * 0.5) * framerate / 10);
+ if(vpx_codec_control(&codec[i], VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ max_intra_size_pct))
+ die_codec(&codec[i], "Failed to set max intra bitrate pct");
+ }
+
+ frame_avail = 1;
+ got_data = 0;
+
+ while(frame_avail || got_data)
+ {
+ vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
+ const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
+
+ flags = 0;
+ frame_avail = read_frame_p(infile, &raw[0]);
+
+ if(frame_avail)
+ {
+ for ( i=1; i<NUM_ENCODERS; i++)
+ {
+ /* Scale the image down by the down-sampling factor. */
+ /* FilterMode 1 or 2 gives better PSNR than FilterMode 0. */
+ I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y],
+ raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U],
+ raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V],
+ raw[i-1].d_w, raw[i-1].d_h,
+ raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
+ raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
+ raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
+ raw[i].d_w, raw[i].d_h, 1);
+ /* Write out down-sampled input. */
+ length_frame = cfg[i].g_w * cfg[i].g_h *3/2;
+ if (fwrite(raw[i].planes[0], 1, length_frame,
+ downsampled_input[NUM_ENCODERS - i - 1]) !=
+ length_frame)
+ {
+ return EXIT_FAILURE;
+ }
+ }
+ }
+
+ /* Set the flags (reference and update) for all the encoders.*/
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ layer_id = cfg[i].ts_layer_id[frame_cnt % cfg[i].ts_periodicity];
+ flags = 0;
+ flag_periodicity = periodicity_to_num_layers
+ [num_temporal_layers[i] - 1];
+ flags = layer_flags[i * VPX_TS_MAX_PERIODICITY +
+ frame_cnt % flag_periodicity];
+ // Key frame flag for first frame.
+ if (frame_cnt == 0)
+ {
+ flags |= VPX_EFLAG_FORCE_KF;
+ }
+ if (frame_cnt > 0 && frame_cnt == key_frame_insert)
+ {
+ flags = VPX_EFLAG_FORCE_KF;
+ }
+
+ vpx_codec_control(&codec[i], VP8E_SET_FRAME_FLAGS, flags);
+ vpx_codec_control(&codec[i], VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+ }
+
+ gettimeofday(&tv1, NULL);
+ /* Encode each frame at all resolution levels. */
+ /* Note the flags must be set to 0 in the encode call if they are set
+ for each frame with the vpx_codec_control(), as done above. */
+ if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
+ frame_cnt, 1, 0, arg_deadline))
+ {
+ die_codec(&codec[0], "Failed to encode frame");
+ }
+ gettimeofday(&tv2, NULL);
+ timersub(&tv2, &tv1, &difftv);
+ cx_time += (double)(difftv.tv_sec * 1000000 + difftv.tv_usec);
+ for (i=NUM_ENCODERS-1; i>=0 ; i--)
+ {
+ got_data = 0;
+ while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) )
+ {
+ got_data = 1;
+ switch(pkt[i]->kind) {
+ case VPX_CODEC_CX_FRAME_PKT:
+ write_ivf_frame_header(outfile[i], pkt[i]);
+ (void) fwrite(pkt[i]->data.frame.buf, 1,
+ pkt[i]->data.frame.sz, outfile[i]);
+ break;
+ case VPX_CODEC_PSNR_PKT:
+ if (show_psnr)
+ {
+ int j;
+
+ psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
+ psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
+ for (j = 0; j < 4; j++)
+ {
+ psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
+ }
+ psnr_count[i]++;
+ }
+
+ break;
+ default:
+ break;
+ }
+ printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT
+ && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":"");
+ fflush(stdout);
+ }
+ }
+ frame_cnt++;
+ }
+ printf("\n");
+ printf("FPS for encoding %d %f %f \n", frame_cnt, (float)cx_time / 1000000,
+ 1000000 * (double)frame_cnt / (double)cx_time);
+
+ fclose(infile);
+
+ printf("Processed %ld frames.\n",(long int)frame_cnt-1);
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ /* Calculate PSNR and print it out */
+ if ( (show_psnr) && (psnr_count[i]>0) )
+ {
+ int j;
+ double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
+ psnr_sse_total[i]);
+
+ fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
+
+ fprintf(stderr, " %.3lf", ovpsnr);
+ for (j = 0; j < 4; j++)
+ {
+ fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
+ }
+ }
+
+ if(vpx_codec_destroy(&codec[i]))
+ die_codec(&codec[i], "Failed to destroy codec");
+
+ vpx_img_free(&raw[i]);
+
+ if(!outfile[i])
+ continue;
+
+ /* Try to rewrite the file header with the actual frame count */
+ if(!fseek(outfile[i], 0, SEEK_SET))
+ write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1);
+ fclose(outfile[i]);
+ }
+ printf("\n");
- return EXIT_SUCCESS;
+ return EXIT_SUCCESS;
}
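The down-scaled encoder dimensions in the hunk above come from a ceiling division by the down-sampling factor, followed by rounding up to an even size. A minimal standalone sketch of that arithmetic follows; the `rational` type and `scale_dim` helper are illustrative, not libvpx API:

#include <stdio.h>

typedef struct { int num, den; } rational;

/* ceil(prev * den / num), then round up to a multiple of 2,
 * mirroring the cfg[i].g_w / g_h computation above. */
static unsigned int scale_dim(unsigned int prev, rational dsf) {
  unsigned int d = (prev * dsf.den + dsf.num - 1) / dsf.num;
  return d + (d % 2);
}

int main(void) {
  rational dsf[3] = {{2, 1}, {2, 1}, {1, 1}};
  unsigned int w = 1280, h = 720;
  int i;
  printf("level 0: %ux%u\n", w, h);
  for (i = 1; i < 3; i++) {
    w = scale_dim(w, dsf[i - 1]);
    h = scale_dim(h, dsf[i - 1]);
    printf("level %d: %ux%u\n", i, w, h);
  }
  return 0;
}

For a 1280x720 input with the default factors this prints 640x360 for level 1 and 320x180 for level 2, matching the resolutions the example's comments describe.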
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c b/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c
index 5e29d808319..a2982821a42 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c
@@ -53,8 +53,8 @@
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c
index 3fcda0cd4bb..54275770d5f 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c
@@ -15,8 +15,8 @@
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
index 53ede94d843..f4deb693b2f 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -19,14 +19,14 @@
#include <string.h>
#include <time.h>
-#include "./args.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../args.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
-#include "./vpxstats.h"
+#include "../vpxstats.h"
static const arg_def_t skip_frames_arg =
ARG_DEF("s", "skip-frames", 1, "input frames to skip");
@@ -60,6 +60,11 @@ static const arg_def_t min_bitrate_arg =
ARG_DEF(NULL, "min-bitrate", 1, "Minimum bitrate");
static const arg_def_t max_bitrate_arg =
ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate");
+static const arg_def_t lag_in_frame_arg =
+ ARG_DEF(NULL, "lag-in-frames", 1, "Number of frame to input before "
+ "generating any outputs");
+static const arg_def_t rc_end_usage_arg =
+ ARG_DEF(NULL, "rc-end-usage", 1, "0 - 3: VBR, CBR, CQ, Q");
#if CONFIG_VP9_HIGHBITDEPTH
static const struct arg_enum_list bitdepth_enum[] = {
@@ -80,11 +85,11 @@ static const arg_def_t *svc_args[] = {
&timebase_arg, &bitrate_arg, &skip_frames_arg, &spatial_layers_arg,
&kf_dist_arg, &scale_factors_arg, &passes_arg, &pass_arg,
&fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg,
- &max_bitrate_arg, &temporal_layers_arg,
+ &max_bitrate_arg, &temporal_layers_arg, &lag_in_frame_arg,
#if CONFIG_VP9_HIGHBITDEPTH
&bitdepth_arg,
#endif
- NULL
+ &rc_end_usage_arg, NULL
};
static const uint32_t default_frames_to_skip = 0;
@@ -207,6 +212,10 @@ static void parse_command_line(int argc, const char **argv_,
min_bitrate = arg_parse_uint(&arg);
} else if (arg_match(&arg, &max_bitrate_arg, argi)) {
max_bitrate = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &lag_in_frame_arg, argi)) {
+ enc_cfg->g_lag_in_frames = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &rc_end_usage_arg, argi)) {
+ enc_cfg->rc_end_usage = arg_parse_uint(&arg);
#if CONFIG_VP9_HIGHBITDEPTH
} else if (arg_match(&arg, &bitdepth_arg, argi)) {
enc_cfg->g_bit_depth = arg_parse_enum_or_int(&arg);
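The new --rc-end-usage flag above assigns a raw integer to enc_cfg->rc_end_usage; in libvpx's vpx_encoder.h the vpx_rc_mode enum orders the modes VPX_VBR, VPX_CBR, VPX_CQ, VPX_Q, which is what the "0 - 3" help string refers to. A hedged sketch of a friendlier parser (the helper name is hypothetical, not part of this patch):

#include <string.h>
#include "vpx/vpx_encoder.h"

/* Map a command-line token to vpx_rc_mode; returns -1 on unknown input.
 * Accepts either the numeric form used by --rc-end-usage or a name. */
static int parse_rc_end_usage(const char *s) {
  if (!strcmp(s, "0") || !strcmp(s, "vbr")) return VPX_VBR;
  if (!strcmp(s, "1") || !strcmp(s, "cbr")) return VPX_CBR;
  if (!strcmp(s, "2") || !strcmp(s, "cq"))  return VPX_CQ;
  if (!strcmp(s, "3") || !strcmp(s, "q"))   return VPX_Q;
  return -1;
}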
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c
index ecae2fe6393..349875997b5 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -19,12 +19,12 @@
#include <string.h>
#include "./vpx_config.h"
-#include "vpx_ports/vpx_timer.h"
+#include "../vpx_ports/vpx_timer.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
@@ -61,6 +61,15 @@ struct RateControlMetrics {
double layer_avg_rate_mismatch[VPX_TS_MAX_LAYERS];
// Actual encoding bitrate per layer (cumulative).
double layer_encoding_bitrate[VPX_TS_MAX_LAYERS];
+ // Average of the short-time encoder actual bitrate.
+ // TODO(marpan): Should we add these short-time stats for each layer?
+ double avg_st_encoding_bitrate;
+ // Variance of the short-time encoder actual bitrate.
+ double variance_st_encoding_bitrate;
+ // Window (number of frames) for computing short-time encoding bitrate.
+ int window_size;
+ // Number of window measurements.
+ int window_count;
};
// Note: these rate control metrics assume only 1 key frame in the
@@ -92,6 +101,10 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc,
rc->layer_avg_frame_size[i] = 0.0;
rc->layer_avg_rate_mismatch[i] = 0.0;
}
+ rc->window_count = 0;
+ rc->window_size = 15;
+ rc->avg_st_encoding_bitrate = 0.0;
+ rc->variance_st_encoding_bitrate = 0.0;
}
static void printout_rate_control_summary(struct RateControlMetrics *rc,
@@ -99,6 +112,7 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
int frame_cnt) {
unsigned int i = 0;
int tot_num_frames = 0;
+ double perc_fluctuation = 0.0;
printf("Total number of processed frames: %d\n\n", frame_cnt -1);
printf("Rate control layer stats for %d layer(s):\n\n",
cfg->ts_number_layers);
@@ -125,6 +139,17 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
100.0 * num_dropped / rc->layer_input_frames[i]);
printf("\n");
}
+ rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count;
+ rc->variance_st_encoding_bitrate =
+ rc->variance_st_encoding_bitrate / rc->window_count -
+ (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate);
+ perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) /
+ rc->avg_st_encoding_bitrate;
+ printf("Short-time stats, for window of %d frames: \n",rc->window_size);
+ printf("Average, rms-variance, and percent-fluct: %f %f %f \n",
+ rc->avg_st_encoding_bitrate,
+ sqrt(rc->variance_st_encoding_bitrate),
+ perc_fluctuation);
if ((frame_cnt - 1) != tot_num_frames)
die("Error: Number of input frames not equal to output! \n");
}
@@ -456,7 +481,11 @@ int main(int argc, char **argv) {
int layering_mode = 0;
int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
int flag_periodicity = 1;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
vpx_svc_layer_id_t layer_id = {0, 0};
+#else
+ vpx_svc_layer_id_t layer_id = {0};
+#endif
const VpxInterface *encoder = NULL;
FILE *infile = NULL;
struct RateControlMetrics rc;
@@ -469,6 +498,9 @@ int main(int argc, char **argv) {
#else
const int min_args = min_args_base;
#endif // CONFIG_VP9_HIGHBITDEPTH
+ double sum_bitrate = 0.0;
+ double sum_bitrate2 = 0.0;
+ double framerate = 30.0;
exec_name = argv[0];
// Check usage and arguments.
@@ -574,12 +606,17 @@ int main(int argc, char **argv) {
cfg.rc_resize_allowed = 0;
cfg.rc_min_quantizer = 2;
cfg.rc_max_quantizer = 56;
+ if (strncmp(encoder->name, "vp9", 3) == 0)
+ cfg.rc_max_quantizer = 52;
cfg.rc_undershoot_pct = 50;
cfg.rc_overshoot_pct = 50;
cfg.rc_buf_initial_sz = 500;
cfg.rc_buf_optimal_sz = 600;
cfg.rc_buf_sz = 1000;
+ // Use 1 thread as default.
+ cfg.g_threads = 1;
+
// Enable error resilient mode.
cfg.g_error_resilient = 1;
cfg.g_lag_in_frames = 0;
@@ -604,6 +641,7 @@ int main(int argc, char **argv) {
die("Failed to open %s for reading", argv[1]);
}
+ framerate = cfg.g_timebase.den / cfg.g_timebase.num;
// Open an output file for each stream.
for (i = 0; i < cfg.ts_number_layers; ++i) {
char file_name[PATH_MAX];
@@ -636,23 +674,28 @@ int main(int argc, char **argv) {
if (strncmp(encoder->name, "vp8", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
- vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
+ vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+ vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
} else if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
- if (vpx_codec_control(&codec, VP9E_SET_SVC, 1)) {
+ vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
+ vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
+ if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0)) {
die_codec(&codec, "Failed to set SVC");
}
}
- vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+ if (strncmp(encoder->name, "vp8", 3) == 0) {
+ vpx_codec_control(&codec, VP8E_SET_SCREEN_CONTENT_MODE, 0);
+ }
vpx_codec_control(&codec, VP8E_SET_TOKEN_PARTITIONS, 1);
// This controls the maximum target size of the key frame.
// For generating smaller key frames, use a smaller max_intra_size_pct
// value, like 100 or 200.
{
- const int max_intra_size_pct = 200;
+ const int max_intra_size_pct = 900;
vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
max_intra_size_pct);
}
@@ -662,14 +705,21 @@ int main(int argc, char **argv) {
struct vpx_usec_timer timer;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
// Update the temporal layer_id. No spatial layers in this test.
layer_id.spatial_layer_id = 0;
+#endif
layer_id.temporal_layer_id =
cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
+ } else if (strncmp(encoder->name, "vp8", 3) == 0) {
+ vpx_codec_control(&codec, VP8E_SET_TEMPORAL_LAYER_ID,
+ layer_id.temporal_layer_id);
}
flags = layer_flags[frame_cnt % flag_periodicity];
+ if (layering_mode == 0)
+ flags = 0;
frame_avail = vpx_img_read(&raw, infile);
if (frame_avail)
++rc.layer_input_frames[layer_id.temporal_layer_id];
@@ -705,6 +755,33 @@ int main(int argc, char **argv) {
++rc.layer_enc_frames[i];
}
}
+ // Update the short-time encoding bitrate states, for a moving window
+ // of size rc->window_size, shifted by rc->window_size / 2.
+ // Ignore first window segment, due to key frame.
+ if (frame_cnt > rc.window_size) {
+ sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+ if (frame_cnt % rc.window_size == 0) {
+ rc.window_count += 1;
+ rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
+ rc.variance_st_encoding_bitrate +=
+ (sum_bitrate / rc.window_size) *
+ (sum_bitrate / rc.window_size);
+ sum_bitrate = 0.0;
+ }
+ }
+ // Second shifted window.
+ if (frame_cnt > rc.window_size + rc.window_size / 2) {
+ sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+ if (frame_cnt > 2 * rc.window_size &&
+ frame_cnt % rc.window_size == 0) {
+ rc.window_count += 1;
+ rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+ rc.variance_st_encoding_bitrate +=
+ (sum_bitrate2 / rc.window_size) *
+ (sum_bitrate2 / rc.window_size);
+ sum_bitrate2 = 0.0;
+ }
+ }
break;
default:
break;
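The short-time bitrate statistics accumulated above reduce, in printout_rate_control_summary, to the standard identity var(x) = E[x^2] - E[x]^2 applied to the per-window average bitrates. A self-contained sketch of the same computation, assuming the per-window averages have already been collected (function name is illustrative):

#include <math.h>
#include <stdio.h>

/* Given per-window average bitrates (kbps), print the mean, rms-variance,
 * and percent fluctuation the same way the summary above does. */
static void report_windowed_bitrate(const double *win_avg, int window_count) {
  double sum = 0.0, sum_sq = 0.0, mean, var;
  int i;
  for (i = 0; i < window_count; i++) {
    sum += win_avg[i];
    sum_sq += win_avg[i] * win_avg[i];
  }
  mean = sum / window_count;
  var = sum_sq / window_count - mean * mean; /* E[x^2] - E[x]^2 */
  printf("Average, rms-variance, and percent-fluct: %f %f %f\n",
         mean, sqrt(var), 100.0 * sqrt(var) / mean);
}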
diff --git a/chromium/third_party/libvpx/source/libvpx/libs.doxy_template b/chromium/third_party/libvpx/source/libvpx/libs.doxy_template
index 02e290242b4..5a8f847280e 100644
--- a/chromium/third_party/libvpx/source/libvpx/libs.doxy_template
+++ b/chromium/third_party/libvpx/source/libvpx/libs.doxy_template
@@ -36,7 +36,7 @@ DOXYFILE_ENCODING = UTF-8
# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
# by quotes) that should identify the project.
-PROJECT_NAME = "WebM VP8 Codec SDK"
+PROJECT_NAME = "WebM Codec SDK"
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.
@@ -415,12 +415,6 @@ MAX_INITIALIZER_LINES = 30
SHOW_USED_FILES = YES
-# If the sources in your project are distributed over multiple directories
-# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
-# in the documentation. The default is NO.
-
-SHOW_DIRECTORIES = NO
-
# The FILE_VERSION_FILTER tag can be used to specify a program or script that
# doxygen should invoke to get the current version for each file (typically from the
# version control system). Doxygen will invoke the program by executing (via
@@ -715,12 +709,6 @@ HTML_FOOTER =
HTML_STYLESHEET =
-# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
-# files or namespaces will be aligned in HTML using tables. If set to
-# NO a bullet list will be used.
-
-HTML_ALIGN_MEMBERS = YES
-
# If the GENERATE_HTMLHELP tag is set to YES, additional index files
# will be generated that can be used as input for tools like the
# Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
diff --git a/chromium/third_party/libvpx/source/libvpx/libs.mk b/chromium/third_party/libvpx/source/libvpx/libs.mk
index f9f2d80702f..6eee0039c29 100644
--- a/chromium/third_party/libvpx/source/libvpx/libs.mk
+++ b/chromium/third_party/libvpx/source/libvpx/libs.mk
@@ -18,32 +18,6 @@ else
endif
#
-# Calculate platform- and compiler-specific offsets for hand coded assembly
-#
-ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
-OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU'
-define asm_offsets_template
-$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).S
- @echo " [CREATE] $$@"
- $$(qexec)LC_ALL=C grep $$(OFFSET_PATTERN) $$< | tr -d '$$$$\#' $$(ADS2GAS) > $$@
-$$(BUILD_PFX)$(2).S: $(2)
-CLEAN-OBJS += $$(BUILD_PFX)$(1) $(2).S
-endef
-else
- ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
-define asm_offsets_template
-$$(BUILD_PFX)$(1): obj_int_extract
-$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).o
- @echo " [CREATE] $$@"
- $$(qexec)./obj_int_extract rvds $$< $$(ADS2GAS) > $$@
-OBJS-yes += $$(BUILD_PFX)$(2).o
-CLEAN-OBJS += $$(BUILD_PFX)$(1)
-$$(filter %$$(ASM).o,$$(OBJS-yes)): $$(BUILD_PFX)$(1)
-endef
-endif # rvct
-endif # !gcc
-
-#
# Rule to generate runtime cpu detection files
#
define rtcd_h_template
@@ -80,6 +54,9 @@ CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk
CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))
+include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk
+CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
+
ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
VP8_PREFIX=vp8/
include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
@@ -205,33 +182,13 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)
# based build systems.
libvpx_srcs.txt:
@echo " [CREATE] $@"
- @echo $(CODEC_SRCS) | xargs -n1 echo | sort -u > $@
+ @echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
CLEAN-OBJS += libvpx_srcs.txt
ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
ifeq ($(CONFIG_MSVS),yes)
-obj_int_extract.bat: $(SRC_PATH_BARE)/build/$(MSVS_ARCH_DIR)/obj_int_extract.bat
- @cp $^ $@
-
-obj_int_extract.$(VCPROJ_SFX): obj_int_extract.bat
-obj_int_extract.$(VCPROJ_SFX): $(SRC_PATH_BARE)/build/make/obj_int_extract.c
- @echo " [CREATE] $@"
- $(qexec)$(GEN_VCPROJ) \
- --exe \
- --target=$(TOOLCHAIN) \
- --name=obj_int_extract \
- --ver=$(CONFIG_VS_VERSION) \
- --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \
- --src-path-bare="$(SRC_PATH_BARE)" \
- $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
- --out=$@ $^ \
- -I. \
- -I"$(SRC_PATH_BARE)" \
-
-PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.$(VCPROJ_SFX)
-
vpx.def: $(call enabled,CODEC_EXPORTS)
@echo " [CREATE] $@"
$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
@@ -246,7 +203,7 @@ ASM_INCLUDES := \
vpx_config.asm \
vpx_ports/x86_abi_support.asm \
-vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def obj_int_extract.$(VCPROJ_SFX)
+vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
@echo " [CREATE] $@"
$(qexec)$(GEN_VCPROJ) \
$(if $(CONFIG_SHARED),--dll,--lib) \
@@ -276,25 +233,27 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
BUILD_LIBVPX_SO := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED))
+SO_VERSION_MAJOR := 2
+SO_VERSION_MINOR := 0
+SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
-LIBVPX_SO := libvpx.$(VERSION_MAJOR).dylib
+LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
EXPORT_FILE := libvpx.syms
LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \
libvpx.dylib )
else
-LIBVPX_SO := libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
+LIBVPX_SO := libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH)
EXPORT_FILE := libvpx.ver
-SYM_LINK := libvpx.so
LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \
- libvpx.so libvpx.so.$(VERSION_MAJOR) \
- libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR))
+ libvpx.so libvpx.so.$(SO_VERSION_MAJOR) \
+ libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
endif
LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)\
$(notdir $(LIBVPX_SO_SYMLINKS))
$(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) $(EXPORT_FILE)
$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm
-$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(VERSION_MAJOR)
+$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR)
$(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE)
libvpx.ver: $(call enabled,CODEC_EXPORTS)
@@ -377,7 +336,7 @@ CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm
endif
#
-# Add assembler dependencies for configuration and offsets
+# Add assembler dependencies for configuration.
#
$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
@@ -402,7 +361,7 @@ libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
libvpx_test_srcs.txt:
@echo " [CREATE] $@"
- @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@
+ @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
CLEAN-OBJS += libvpx_test_srcs.txt
$(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
@@ -535,7 +494,11 @@ libs.doxy: $(CODEC_DOC_SRCS)
@echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@
## Generate rtcd.h for all objects
+ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes)
$(OBJS-yes:.o=.d): $(RTCD)
+else
+$(OBJS-yes): $(RTCD)
+endif
## Update the global src list
SRCS += $(CODEC_SRCS) $(LIBVPX_TEST_SRCS) $(GTEST_SRCS)
diff --git a/chromium/third_party/libvpx/source/libvpx/mainpage.dox b/chromium/third_party/libvpx/source/libvpx/mainpage.dox
index e2ec280027e..ec202fa4fb5 100644
--- a/chromium/third_party/libvpx/source/libvpx/mainpage.dox
+++ b/chromium/third_party/libvpx/source/libvpx/mainpage.dox
@@ -1,4 +1,4 @@
-/*!\mainpage WebM VP8 Codec SDK
+/*!\mainpage WebM Codec SDK
\section main_contents Page Contents
- \ref main_intro
@@ -6,11 +6,11 @@
- \ref main_support
\section main_intro Introduction
- Welcome to the WebM VP8 Codec SDK. This SDK allows you to integrate your
- applications with the VP8 video codec, a high quality, royalty free, open
- source codec deployed on millions of computers and devices worldwide.
+ Welcome to the WebM Codec SDK. This SDK allows you to integrate your
+ applications with the VP8 and VP9 video codecs, high quality, royalty free,
+ open source codecs deployed on billions of computers and devices worldwide.
- This distribution of the WebM VP8 Codec SDK includes the following support:
+ This distribution of the WebM Codec SDK includes the following support:
\if vp8_encoder
- \ref vp8_encoder
@@ -28,12 +28,12 @@
- Read the \ref samples "sample code" for examples of how to interact with the
codec.
- \ref codec reference
- \if encoder
- - \ref encoder reference
- \endif
- \if decoder
- - \ref decoder reference
- \endif
+ \if encoder
+ - \ref encoder reference
+ \endif
+ \if decoder
+ - \ref decoder reference
+ \endif
\section main_support Support Options & FAQ
The WebM project is an open source project supported by its community. For
diff --git a/chromium/third_party/libvpx/source/libvpx/solution.mk b/chromium/third_party/libvpx/source/libvpx/solution.mk
index 2c8d29a2a1e..145adc0ddad 100644
--- a/chromium/third_party/libvpx/source/libvpx/solution.mk
+++ b/chromium/third_party/libvpx/source/libvpx/solution.mk
@@ -9,7 +9,7 @@
##
# libvpx reverse dependencies (targets that depend on libvpx)
-VPX_NONDEPS=$(addsuffix .$(VCPROJ_SFX),vpx gtest obj_int_extract)
+VPX_NONDEPS=$(addsuffix .$(VCPROJ_SFX),vpx gtest)
VPX_RDEPS=$(foreach vcp,\
$(filter-out $(VPX_NONDEPS),$^), --dep=$(vcp:.$(VCPROJ_SFX)=):vpx)
@@ -17,7 +17,6 @@ vpx.sln: $(wildcard *.$(VCPROJ_SFX))
@echo " [CREATE] $@"
$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
$(if $(filter vpx.$(VCPROJ_SFX),$^),$(VPX_RDEPS)) \
- --dep=vpx:obj_int_extract \
--dep=test_libvpx:gtest \
--ver=$(CONFIG_VS_VERSION)\
--out=$@ $^
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/README.libvpx b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/README.libvpx
index 3869d25bc40..6a5b60c063b 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/README.libvpx
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1060
+Version: 1305
License: BSD
License File: LICENSE
@@ -13,4 +13,4 @@ which down-samples the original input video (f.g. 1280x720) a number of times
in order to encode multiple resolution bit streams.
Local Modifications:
-cherry-pick 'Issue 24479004: Fix building with MSVC for arm'
+cherry pick r1311 'disable nv12 avx2 for vs9/10 that dont support avx2 instructions.'
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/compare.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/compare.h
index 5dfac7c86aa..08b2bb2ecf4 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/compare.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/compare.h
@@ -22,6 +22,11 @@ extern "C" {
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+
// Sum Square Error - used to compute Mean Square Error or PSNR.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a,
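ComputeSumSquareError above pairs naturally with the usual 8-bit PSNR formula, PSNR = 10 * log10(255^2 * count / SSE). A minimal sketch of that conversion; sse_to_psnr here is a local helper for illustration, not part of the libyuv API:

#include <math.h>

/* Convert a sum of squared errors over `count` 8-bit samples to PSNR in dB.
 * Returns a conventional cap for identical buffers (sse == 0). */
static double sse_to_psnr(double count, double sse) {
  if (sse == 0.0) return 99.0;
  return 10.0 * log10(255.0 * 255.0 * count / sse);
}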
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert.h
index 1bd45c837f1..97936adb27d 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert.h
@@ -113,15 +113,6 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert Q420 to I420.
-LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
// ARGB little endian (bgra in memory) to I420.
LIBYUV_API
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
@@ -211,8 +202,6 @@ int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height);
#endif
-// Note Bayer formats (BGGR) To I420 are in format_conversion.h
-
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
// "dst_stride_y" number of bytes in a row of the dst_y plane.
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_argb.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
index a18014ca2c8..a0de89efd99 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
@@ -18,7 +18,6 @@
#include "libyuv/rotate.h"
// TODO(fbarchard): This set of functions should exactly match convert.h
-// Add missing Q420.
// TODO(fbarchard): Add tests. Create random content of right size and convert
// with C vs Opt and or to I420 and compare.
// TODO(fbarchard): Some of these functions lack parameter setting.
@@ -104,13 +103,6 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
-// TODO(fbarchard): Convert Q420 to ARGB.
-// LIBYUV_API
-// int Q420ToARGB(const uint8* src_y, int src_stride_y,
-// const uint8* src_yuy2, int src_stride_yuy2,
-// uint8* dst_argb, int dst_stride_argb,
-// int width, int height);
-
// Convert YUY2 to ARGB.
LIBYUV_API
int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
@@ -123,6 +115,22 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
@@ -184,8 +192,6 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
int dst_width, int dst_height);
#endif
-// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
-
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from.h
index b1cf57f7dc0..d6c0e3d8703 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from.h
@@ -57,7 +57,6 @@ int I400Copy(const uint8* src_y, int src_stride_y,
int width, int height);
// TODO(fbarchard): I420ToM420
-// TODO(fbarchard): I420ToQ420
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
@@ -152,8 +151,6 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
-// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
-
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
index 90f43af04c3..c592fc2353e 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -61,6 +61,13 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
+// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
+// Values in dither matrix from 0 to 255. 128 is best for no dither.
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ const uint8* dither8x8, int width, int height);
+
// Convert ARGB To ARGB1555.
LIBYUV_API
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
@@ -105,6 +112,14 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
uint8* dst_v, int dst_stride_v,
int width, int height);
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
// Convert ARGB To I411.
LIBYUV_API
int ARGBToI411(const uint8* src_argb, int src_stride_argb,
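Per the comment on ARGBToRGB565Dither above, a dither value of 128 is neutral, and any 8x8 byte matrix can be supplied. A hedged sketch that passes a neutral matrix through the signature declared above (the wrapper name and packed strides are illustrative):

#include <string.h>
#include "libyuv/convert_from_argb.h"

/* Convert ARGB to RGB565 with a neutral dither matrix (all 128 = no dither).
 * Substitute an ordered-dither pattern in dither8x8 to reduce banding. */
static int argb_to_rgb565_neutral(const uint8* argb, int width, int height,
                                  uint8* rgb565) {
  uint8 dither8x8[64];
  memset(dither8x8, 128, sizeof(dither8x8)); /* 128 == no dithering */
  return ARGBToRGB565Dither(argb, width * 4,
                            rgb565, width * 2,
                            dither8x8, width, height);
}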
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/format_conversion.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/format_conversion.h
deleted file mode 100644
index b18bf053438..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/format_conversion.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT
-#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert Bayer RGB formats to I420.
-LIBYUV_API
-int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-LIBYUV_API
-int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-LIBYUV_API
-int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-LIBYUV_API
-int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Temporary API mapper.
-#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
- BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
-
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
- uint32 src_fourcc_bayer);
-
-// Convert I420 to Bayer RGB formats.
-LIBYUV_API
-int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-LIBYUV_API
-int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-LIBYUV_API
-int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-LIBYUV_API
-int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-// Temporary API mapper.
-#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
- I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
-
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height,
- uint32 dst_fourcc_bayer);
-
-// Convert Bayer RGB formats to ARGB.
-LIBYUV_API
-int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-LIBYUV_API
-int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-LIBYUV_API
-int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-LIBYUV_API
-int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Temporary API mapper.
-#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
-
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height,
- uint32 src_fourcc_bayer);
-
-// Converts ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height);
-
-LIBYUV_API
-int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height);
-
-LIBYUV_API
-int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height);
-
-LIBYUV_API
-int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height);
-
-// Temporary API mapper.
-#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
-
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height,
- uint32 dst_fourcc_bayer);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT
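
The hunk above deletes libyuv's entire public Bayer API (format_conversion.h with the Bayer<->I420/ARGB entry points), so callers must now demosaic before handing frames to libyuv. A minimal nearest-neighbour sketch of a caller-side replacement for GRBG input follows; everything in it is an assumption for illustration. It assumes even dimensions and emits a half-resolution frame (one ARGB pixel per 2x2 Bayer cell), whereas a real replacement would interpolate to full resolution:

    #include <stdint.h>

    /* One ARGB pixel per 2x2 Bayer cell (GRBG layout: G R on the even row,
     * B G on the odd row). Output is (width/2) x (height/2) pixels. */
    static void DemosaicGRBGToARGB(const uint8_t* bayer, int bayer_stride,
                                   uint8_t* argb, int argb_stride,
                                   int width, int height) {
      for (int y = 0; y < height; y += 2) {
        const uint8_t* row0 = bayer + y * bayer_stride;
        const uint8_t* row1 = row0 + bayer_stride;
        uint8_t* out = argb + (y / 2) * argb_stride;
        for (int x = 0; x < width; x += 2) {
          uint8_t g = row0[x];      /* green, top-left sample */
          uint8_t r = row0[x + 1];  /* red, top-right sample */
          uint8_t b = row1[x];      /* blue, bottom-left sample */
          out[0] = b;  /* libyuv "ARGB" byte order in memory: B, G, R, A */
          out[1] = g;
          out[2] = r;
          out[3] = 255;
          out += 4;
        }
      }
    }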
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/row.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/row.h
index 4b3c870f91c..80e844bae3c 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/row.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/row.h
@@ -15,10 +15,6 @@
#include "libyuv/basic_types.h"
-#if defined(__native_client__)
-#include "ppapi/c/pp_macros.h" // For PPAPI_RELEASE
-#endif
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -43,6 +39,7 @@ extern "C" {
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR) || \
+ (defined(__i386__) && !defined(__SSE2__)) || \
(defined(_MSC_VER) && defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
@@ -51,48 +48,16 @@ extern "C" {
#define LIBYUV_SSSE3_ONLY
#endif
-// Enable for NaCL pepper 33 for bundle and AVX2 support.
-#if defined(__native_client__) && PPAPI_RELEASE >= 33
-#define NEW_BINUTILS
-#endif
-#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
#define LIBYUV_DISABLE_NEON
-#endif
+#endif // clang < 3.5
+#endif // __clang__
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-// Effects:
-#define HAS_ARGBADDROW_SSE2
-#define HAS_ARGBAFFINEROW_SSE2
-#define HAS_ARGBATTENUATEROW_SSSE3
-#define HAS_ARGBBLENDROW_SSSE3
-#define HAS_ARGBCOLORMATRIXROW_SSSE3
-#define HAS_ARGBCOLORTABLEROW_X86
-#define HAS_ARGBCOPYALPHAROW_SSE2
-#define HAS_ARGBCOPYYTOALPHAROW_SSE2
-#define HAS_ARGBGRAYROW_SSSE3
-#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
-#define HAS_ARGBMIRRORROW_SSSE3
-#define HAS_ARGBMULTIPLYROW_SSE2
-#define HAS_ARGBPOLYNOMIALROW_SSE2
-#define HAS_ARGBQUANTIZEROW_SSE2
-#define HAS_ARGBSEPIAROW_SSSE3
-#define HAS_ARGBSHADEROW_SSE2
-#define HAS_ARGBSUBTRACTROW_SSE2
-#define HAS_ARGBTOUVROW_SSSE3
-#define HAS_ARGBUNATTENUATEROW_SSE2
-#define HAS_COMPUTECUMULATIVESUMROW_SSE2
-#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-#define HAS_INTERPOLATEROW_SSE2
-#define HAS_INTERPOLATEROW_SSSE3
-#define HAS_RGBCOLORTABLEROW_X86
-#define HAS_SOBELROW_SSE2
-#define HAS_SOBELTOPLANEROW_SSE2
-#define HAS_SOBELXROW_SSE2
-#define HAS_SOBELXYROW_SSE2
-#define HAS_SOBELYROW_SSE2
-
// Conversions:
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
@@ -103,24 +68,21 @@ extern "C" {
#define HAS_ARGBTOARGB1555ROW_SSE2
#define HAS_ARGBTOARGB4444ROW_SSE2
#define HAS_ARGBTOBAYERGGROW_SSE2
-#define HAS_ARGBTOBAYERROW_SSSE3
#define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565ROW_SSE2
#define HAS_ARGBTOUV422ROW_SSSE3
#define HAS_ARGBTOUV444ROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
-#define HAS_COPYROW_X86
-#define HAS_HALFROW_SSE2
#define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3
-#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
@@ -133,6 +95,7 @@ extern "C" {
#define HAS_I422TOUYVYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOARGBROW_SSSE3
+// #define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROW_SSSE3
@@ -150,6 +113,8 @@ extern "C" {
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#define HAS_SETROW_X86
+#define HAS_SETROW_ERMS
+#define HAS_ARGBSETROW_X86
#define HAS_SPLITUVROW_SSE2
#define HAS_UYVYTOARGBROW_SSSE3
#define HAS_UYVYTOUV422ROW_SSE2
@@ -160,6 +125,36 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
+
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
#endif
// The following are available on x64 Visual C:
@@ -186,26 +181,39 @@ extern "C" {
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
+// The following are available, but require VS2012. Port to GCC.
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393
+#define HAS_I422TOABGRROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TOBGRAROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TORGB565ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
+#endif
+
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
-// Effects:
-#define HAS_ARGBPOLYNOMIALROW_AVX2
-#define HAS_ARGBSHUFFLEROW_AVX2
#define HAS_ARGBCOPYALPHAROW_AVX2
#define HAS_ARGBCOPYYTOALPHAROW_AVX2
-#endif
-
-// The following are require VS2012.
-// TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
#define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
-#define HAS_HALFROW_AVX2
-#define HAS_I422TOARGBROW_AVX2
+#define HAS_COPYROW_AVX
#define HAS_INTERPOLATEROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
@@ -213,18 +221,25 @@ extern "C" {
#define HAS_UYVYTOUV422ROW_AVX2
#define HAS_UYVYTOUVROW_AVX2
#define HAS_UYVYTOYROW_AVX2
+#define HAS_YTOARGBROW_AVX2
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
+// The following require HAS_I422TOARGBROW_AVX2
+#if defined(HAS_I422TOARGBROW_AVX2)
+#define HAS_YUY2TOARGBROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
+#endif
+
// Effects:
#define HAS_ARGBADDROW_AVX2
#define HAS_ARGBATTENUATEROW_AVX2
-#define HAS_ARGBMIRRORROW_AVX2
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBSUBTRACTROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
-#endif // defined(VISUALC_HAS_AVX2)
+#endif
+
// The following are Yasm x86 only:
// TODO(fbarchard): Port AVX2 to inline.
@@ -245,106 +260,14 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBBLENDROW_SSE2
#define HAS_ARGBATTENUATEROW_SSE2
+#define HAS_ARGBBLENDROW_SSE2
#define HAS_MIRRORROW_SSE2
#endif
-// The following are available on arm64 platforms:
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-// #define HAS_I444TOARGBROW_NEON
-// #define HAS_I422TOARGBROW_NEON
-// #define HAS_I411TOARGBROW_NEON
-// #define HAS_I422TOBGRAROW_NEON
-// #define HAS_I422TOABGRROW_NEON
-// #define HAS_I422TORGBAROW_NEON
-// #define HAS_I422TORGB24ROW_NEON
-// #define HAS_I422TORAWROW_NEON
-// #define HAS_I422TORGB565ROW_NEON
-// #define HAS_I422TOARGB1555ROW_NEON
-// #define HAS_I422TOARGB4444ROW_NEON
-// #define HAS_YTOARGBROW_NEON
-// #define HAS_I400TOARGBROW_NEON
-// #define HAS_NV12TOARGBROW_NEON
-// #define HAS_NV21TOARGBROW_NEON
-// #define HAS_NV12TORGB565ROW_NEON
-// #define HAS_NV21TORGB565ROW_NEON
-// #define HAS_YUY2TOARGBROW_NEON
-// #define HAS_UYVYTOARGBROW_NEON
-#define HAS_SPLITUVROW_NEON
-#define HAS_MERGEUVROW_NEON
-#define HAS_COPYROW_NEON
-#define HAS_SETROW_NEON
-#define HAS_ARGBSETROWS_NEON
-#define HAS_MIRRORROW_NEON
-#define HAS_MIRRORUVROW_NEON
-#define HAS_ARGBMIRRORROW_NEON
-#define HAS_RGB24TOARGBROW_NEON
-#define HAS_RAWTOARGBROW_NEON
-// #define HAS_RGB565TOARGBROW_NEON
-// #define HAS_ARGB1555TOARGBROW_NEON
-// #define HAS_ARGB4444TOARGBROW_NEON
-#define HAS_ARGBTORGB24ROW_NEON
-#define HAS_ARGBTORAWROW_NEON
-#define HAS_YUY2TOYROW_NEON
-#define HAS_UYVYTOYROW_NEON
-#define HAS_YUY2TOUV422ROW_NEON
-#define HAS_UYVYTOUV422ROW_NEON
-#define HAS_YUY2TOUVROW_NEON
-#define HAS_UYVYTOUVROW_NEON
-#define HAS_HALFROW_NEON
-#define HAS_ARGBTOBAYERROW_NEON
-#define HAS_ARGBTOBAYERGGROW_NEON
-#define HAS_ARGBSHUFFLEROW_NEON
-#define HAS_I422TOYUY2ROW_NEON
-#define HAS_I422TOUYVYROW_NEON
-// #define HAS_ARGBTORGB565ROW_NEON
-// #define HAS_ARGBTOARGB1555ROW_NEON
-// #define HAS_ARGBTOARGB4444ROW_NEON
-#define HAS_ARGBTOYROW_NEON
-#define HAS_ARGBTOYJROW_NEON
-// #define HAS_ARGBTOUV444ROW_NEON
-// #define HAS_ARGBTOUV422ROW_NEON
-// #define HAS_ARGBTOUV411ROW_NEON
-// #define HAS_ARGBTOUVROW_NEON
-// #define HAS_ARGBTOUVJROW_NEON
-// #define HAS_BGRATOUVROW_NEON
-// #define HAS_ABGRTOUVROW_NEON
-// #define HAS_RGBATOUVROW_NEON
-// #define HAS_RGB24TOUVROW_NEON
-// #define HAS_RAWTOUVROW_NEON
-// #define HAS_RGB565TOUVROW_NEON
-// #define HAS_ARGB1555TOUVROW_NEON
-// #define HAS_ARGB4444TOUVROW_NEON
-// #define HAS_RGB565TOYROW_NEON
-// #define HAS_ARGB1555TOYROW_NEON
-// #define HAS_ARGB4444TOYROW_NEON
-// #define HAS_BGRATOYROW_NEON
-// #define HAS_ABGRTOYROW_NEON
-// #define HAS_RGBATOYROW_NEON
-// #define HAS_RGB24TOYROW_NEON
-// #define HAS_RAWTOYROW_NEON
-// #define HAS_INTERPOLATEROW_NEON
-// #define HAS_ARGBBLENDROW_NEON
-// #define HAS_ARGBATTENUATEROW_NEON
-// #define HAS_ARGBQUANTIZEROW_NEON
-// #define HAS_ARGBSHADEROW_NEON
-// #define HAS_ARGBGRAYROW_NEON
-// #define HAS_ARGBSEPIAROW_NEON
-// #define HAS_ARGBCOLORMATRIXROW_NEON
-#define HAS_ARGBMULTIPLYROW_NEON
-#define HAS_ARGBADDROW_NEON
-#define HAS_ARGBSUBTRACTROW_NEON
-#define HAS_SOBELROW_NEON
-#define HAS_SOBELTOPLANEROW_NEON
-#define HAS_SOBELXYROW_NEON
-#define HAS_SOBELXROW_NEON
-#define HAS_SOBELYROW_NEON
-#endif
-
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_ABGRTOUVROW_NEON
#define HAS_ABGRTOYROW_NEON
#define HAS_ARGB1555TOARGBROW_NEON
@@ -355,7 +278,6 @@ extern "C" {
#define HAS_ARGB4444TOYROW_NEON
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
-#define HAS_ARGBTOBAYERROW_NEON
#define HAS_ARGBTOBAYERGGROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
@@ -363,14 +285,13 @@ extern "C" {
#define HAS_ARGBTOUV411ROW_NEON
#define HAS_ARGBTOUV422ROW_NEON
#define HAS_ARGBTOUV444ROW_NEON
-#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOUVJROW_NEON
-#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
+#define HAS_ARGBTOYROW_NEON
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_COPYROW_NEON
-#define HAS_HALFROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I411TOARGBROW_NEON
#define HAS_I422TOABGRROW_NEON
@@ -404,6 +325,7 @@ extern "C" {
#define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
+#define HAS_ARGBSETROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
@@ -426,25 +348,25 @@ extern "C" {
#define HAS_ARGBSEPIAROW_NEON
#define HAS_ARGBSHADEROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_INTERPOLATEROW_NEON
#define HAS_SOBELROW_NEON
#define HAS_SOBELTOPLANEROW_NEON
-#define HAS_SOBELXYROW_NEON
#define HAS_SOBELXROW_NEON
+#define HAS_SOBELXYROW_NEON
#define HAS_SOBELYROW_NEON
-#define HAS_INTERPOLATEROW_NEON
-// TODO(fbarchard): Investigate neon unittest failure.
-// #define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_I422TOABGRROW_MIPS_DSPR2
#define HAS_I422TOARGBROW_MIPS_DSPR2
#define HAS_I422TOBGRAROW_MIPS_DSPR2
-#define HAS_INTERPOLATEROWS_MIPS_DSPR2
+#define HAS_INTERPOLATEROW_MIPS_DSPR2
#define HAS_MIRRORROW_MIPS_DSPR2
#define HAS_MIRRORUVROW_MIPS_DSPR2
#define HAS_SPLITUVROW_MIPS_DSPR2
@@ -453,6 +375,7 @@ extern "C" {
#if defined(_MSC_VER) && !defined(__CLR_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#define SIMD_ALIGNED32(var) __declspec(align(64)) var
typedef __declspec(align(16)) int16 vec16[8];
typedef __declspec(align(16)) int32 vec32[4];
typedef __declspec(align(16)) int8 vec8[16];
@@ -469,20 +392,34 @@ typedef __declspec(align(32)) uint8 ulvec8[32];
#elif defined(__GNUC__)
// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))
typedef int16 __attribute__((vector_size(16))) vec16;
typedef int32 __attribute__((vector_size(16))) vec32;
typedef int8 __attribute__((vector_size(16))) vec8;
typedef uint16 __attribute__((vector_size(16))) uvec16;
typedef uint32 __attribute__((vector_size(16))) uvec32;
typedef uint8 __attribute__((vector_size(16))) uvec8;
+typedef int16 __attribute__((vector_size(32))) lvec16;
+typedef int32 __attribute__((vector_size(32))) lvec32;
+typedef int8 __attribute__((vector_size(32))) lvec8;
+typedef uint16 __attribute__((vector_size(32))) ulvec16;
+typedef uint32 __attribute__((vector_size(32))) ulvec32;
+typedef uint8 __attribute__((vector_size(32))) ulvec8;
#else
#define SIMD_ALIGNED(var) var
+#define SIMD_ALIGNED32(var) var
typedef int16 vec16[8];
typedef int32 vec32[4];
typedef int8 vec8[16];
typedef uint16 uvec16[8];
typedef uint32 uvec32[4];
typedef uint8 uvec8[16];
+typedef int16 lvec16[16];
+typedef int32 lvec32[8];
+typedef int8 lvec8[32];
+typedef uint16 ulvec16[16];
+typedef uint32 ulvec32[8];
+typedef uint8 ulvec8[32];
#endif
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
@@ -492,24 +429,16 @@ typedef uint8 uvec8[16];
#endif
// NaCL macros for GCC x86 and x64.
-
-// TODO(nfullagar): When pepper_33 toolchain is distributed, default to
-// NEW_BINUTILS and remove all BUNDLEALIGN occurances.
#if defined(__native_client__)
#define LABELALIGN ".p2align 5\n"
#else
-#define LABELALIGN ".p2align 2\n"
+#define LABELALIGN
#endif
#if defined(__native_client__) && defined(__x86_64__)
-#if defined(NEW_BINUTILS)
+// r14 is used for MEMOP macros.
+#define NACL_R14 "r14",
#define BUNDLELOCK ".bundle_lock\n"
#define BUNDLEUNLOCK ".bundle_unlock\n"
-#define BUNDLEALIGN "\n"
-#else
-#define BUNDLELOCK "\n"
-#define BUNDLEUNLOCK "\n"
-#define BUNDLEALIGN ".p2align 5\n"
-#endif
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
@@ -534,8 +463,19 @@ typedef uint8 uvec8[16];
"lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
#opcode " (%%r15,%%r14),%" #arg "\n" \
BUNDLEUNLOCK
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
+ BUNDLEUNLOCK
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
+ BUNDLEUNLOCK
#else // defined(__native_client__) && defined(__x86_64__)
-#define BUNDLEALIGN "\n"
+#define NACL_R14
+#define BUNDLEALIGN
#define MEMACCESS(base) "(%" #base ")"
#define MEMACCESS2(offset, base) #offset "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
@@ -551,14 +491,19 @@ typedef uint8 uvec8[16];
#opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
#define MEMOPARG(opcode, offset, base, index, scale, arg) \
#opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
+ #reg2 "\n"
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+ #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
#endif // defined(__native_client__) && defined(__x86_64__)
#if defined(__arm__) || defined(__aarch64__)
#undef MEMACCESS
#if defined(__native_client__)
-#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
+#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
#else
-#define MEMACCESS(base) "\n"
+#define MEMACCESS(base)
#endif
#endif
@@ -651,13 +596,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
@@ -736,16 +674,6 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width);
void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
@@ -807,15 +735,11 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
void ARGBToUV444Row_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV422Row_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
@@ -825,6 +749,8 @@ void ARGBToUV422Row_C(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV411Row_C(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
@@ -832,6 +758,10 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width);
@@ -843,9 +773,12 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width);
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
@@ -853,10 +786,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix);
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix);
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
- uint8* dst_v, int pix);
void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix);
void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -874,8 +803,6 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width);
void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
@@ -884,11 +811,14 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
-void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_16_C(const uint16* src, uint16* dst, int count);
@@ -900,15 +830,17 @@ void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void SetRow_X86(uint8* dst, uint32 v32, int count);
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
- int dst_stride, int height);
-void SetRow_NEON(uint8* dst, uint32 v32, int count);
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
- int dst_stride, int height);
-void SetRow_C(uint8* dst, uint32 v32, int count);
-void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride,
- int height);
+void SetRow_C(uint8* dst, uint8 v8, int count);
+void SetRow_X86(uint8* dst, uint8 v8, int count);
+void SetRow_ERMS(uint8* dst, uint8 v8, int count);
+void SetRow_NEON(uint8* dst, uint8 v8, int count);
+void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
+void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
// ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
@@ -921,8 +853,6 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
@@ -975,6 +905,10 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -988,8 +922,10 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+ const uint8* dither8x8, int pix);
+
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
@@ -1032,6 +968,11 @@ void YUY2ToARGBRow_C(const uint8* src_yuy2,
void UYVYToARGBRow_C(const uint8* src_uyvy,
uint8* dst_argb,
int width);
+void J422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToBGRARow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1072,14 +1013,26 @@ void I422ToRGB565Row_C(const uint8* src_y,
const uint8* src_v,
uint8* dst_rgb565,
int width);
-void YToARGBRow_C(const uint8* src_y,
- uint8* dst_argb,
- int width);
void I422ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
+void I422ToBGRARow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGBARow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I444ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1103,6 +1056,14 @@ void NV21ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
int width);
+void NV12ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void NV12ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
@@ -1111,12 +1072,31 @@ void NV21ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
int width);
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
int width);
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
int width);
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void J422ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToBGRARow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1137,17 +1117,31 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToARGB1555Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
-// RGB24/RAW are unaligned.
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToRGB24Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1158,56 +1152,26 @@ void I422ToRAWRow_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_raw,
int width);
-
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width);
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width);
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width);
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- int width);
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
- int width);
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- int width);
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- int width);
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_bgra,
- int width);
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_abgr,
- int width);
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
- int width);
void I422ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
+void I422ToBGRARow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGBARow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1231,6 +1195,14 @@ void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
int width);
+void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
@@ -1239,12 +1211,26 @@ void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
int width);
void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
int width);
+void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1265,17 +1251,31 @@ void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_rgba,
int width);
+void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
int width);
+void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
int width);
-// RGB24/RAW are unaligned.
+void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1286,15 +1286,25 @@ void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
+
+void YToARGBRow_C(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
void YToARGBRow_SSE2(const uint8* src_y,
uint8* dst_argb,
int width);
+void YToARGBRow_AVX2(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_SSE2(const uint8* src_y,
uint8* dst_argb,
int width);
+void YToARGBRow_Any_AVX2(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
void YToARGBRow_Any_NEON(const uint8* src_y,
uint8* dst_argb,
int width);
@@ -1365,6 +1375,10 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -1489,12 +1503,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix);
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
@@ -1530,12 +1538,6 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix);
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
@@ -1568,28 +1570,6 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix);
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix);
-void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix);
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix);
-
-void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
- uint16* dst_uv, int pix);
-
-void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix);
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix);
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix);
-void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix);
-void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix);
void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
uint32 /* selector */, int pix);
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
@@ -1736,15 +1716,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
-void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
@@ -1757,9 +1731,9 @@ void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
-void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
+void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
ptrdiff_t src_stride_ptr,
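
Among the row.h changes above, the old SetRow_*(dst, uint32 v32, count) / ARGBSetRows_* pair is replaced by a byte-filling SetRow_* plus a single-row ARGBSetRow_* for 32-bit pixels. A sketch of the two C reference behaviours implied by the new prototypes; the bodies here are assumptions (the real ones live in row_common.cc):

    #include <stdint.h>
    #include <string.h>

    /* SetRow now has memset semantics: fill 'count' bytes with v8. */
    static void SetRow_C_sketch(uint8_t* dst, uint8_t v8, int count) {
      memset(dst, v8, count);
    }

    /* ARGBSetRow fills 'count' 32-bit pixels with the pattern v32; callers
     * that previously used ARGBSetRows_* now loop over rows themselves.
     * Assumes 4-byte alignment, as ARGB buffers are. */
    static void ARGBSetRow_C_sketch(uint8_t* dst_argb, uint32_t v32,
                                    int count) {
      uint32_t* d = (uint32_t*)dst_argb;
      for (int x = 0; x < count; ++x) {
        d[x] = v32;
      }
    }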
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale.h
index a3bc07e0fd6..102158d1ab2 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale.h
@@ -34,6 +34,7 @@ void ScalePlane(const uint8* src, int src_stride,
int dst_width, int dst_height,
enum FilterMode filtering);
+LIBYUV_API
void ScalePlane_16(const uint16* src, int src_stride,
int src_width, int src_height,
uint16* dst, int dst_stride,
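
The scale.h change simply exports ScalePlane_16 with LIBYUV_API so the symbol is callable across the shared-library boundary. A simplified sketch of what that annotation typically expands to; the actual definition in libyuv/basic_types.h is gated on LIBYUV_BUILDING_SHARED_LIBRARY and related macros:

    #ifdef _WIN32
    #define LIBYUV_API __declspec(dllexport)   /* building the DLL */
    #else
    #define LIBYUV_API __attribute__((visibility("default")))
    #endif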
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h
index 3c495424f1a..27aa04b2202 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h
@@ -44,21 +44,13 @@ extern "C" {
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEROWDOWN2_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
-#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__aarch64__) || defined(LIBYUV_NEON))
-/* #define HAS_SCALEROWDOWN2_NEON */
-/* #define HAS_SCALEROWDOWN4_NEON */
-/* #define HAS_SCALEROWDOWN34_NEON */
-/* #define HAS_SCALEROWDOWN38_NEON */
-/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */
-/* #define HAS_SCALEARGBROWDOWN2_NEON */
#endif
// The following are available on Mips platforms:
@@ -208,15 +200,6 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -267,10 +250,10 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// Row functions.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
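
scale_row.h also switches the ScaleARGBRowDownEven* NEON prototypes from int to ptrdiff_t strides, matching the other row functions. Strides are ptrdiff_t because libyuv expresses vertical flips as negative strides; a minimal self-contained sketch of that idiom (assumed, mirroring the library's negative-height convention):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void CopyPlaneFlip(const uint8_t* src, ptrdiff_t src_stride,
                              uint8_t* dst, ptrdiff_t dst_stride,
                              int width, int height) {
      if (height < 0) {                    /* negative height requests a flip */
        height = -height;
        src += (height - 1) * src_stride;  /* start at the last source row */
        src_stride = -src_stride;          /* and walk upwards */
      }
      for (int y = 0; y < height; ++y) {
        memcpy(dst, src, (size_t)width);
        src += src_stride;
        dst += dst_stride;
      }
    }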
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/version.h
index 73a7f1b019b..9236a7fa1f5 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/version.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1059
+#define LIBYUV_VERSION 1305
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/video_common.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/video_common.h
index 91acc2ffcf9..cb6582f24dc 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/video_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/video_common.h
@@ -62,7 +62,7 @@ enum FourCC {
// 2 Secondary YUV formats: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
- FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
// 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@@ -75,7 +75,7 @@ enum FourCC {
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
- // 4 Secondary RGB formats: 4 Bayer Patterns.
+ // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
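
For reference when reading the deprecated codes above: FOURCC packs four ASCII bytes little-endian into a uint32. A standalone sketch mirroring libyuv's macro:

    #include <stdint.h>
    #include <stdio.h>

    #define FOURCC(a, b, c, d)                                          \
      (((uint32_t)(a)) | ((uint32_t)(b) << 8) | ((uint32_t)(c) << 16) | \
       ((uint32_t)(d) << 24))

    int main(void) {
      /* prints 0x30323451: 'Q' is the low byte, '0' the high byte */
      printf("FOURCC_Q420 = 0x%08x\n", (unsigned)FOURCC('Q', '4', '2', '0'));
      return 0;
    }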
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare.cc
index dc715e0199c..f84a08ee6c3 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare.cc
@@ -19,6 +19,7 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
+#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
@@ -78,6 +79,54 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
return seed;
}
+static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
+ return FOURCC_BGRA;
+ }
+ if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ return FOURCC_ARGB;
+ }
+ if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
+ return FOURCC_BGRA;
+ }
+ if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
+ return FOURCC_ARGB;
+ }
+ argb += 8;
+ }
+ if (width & 1) {
+ if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
+ return FOURCC_BGRA;
+ }
+ if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ return FOURCC_ARGB;
+ }
+ }
+ return 0;
+}
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
+ uint32 fourcc = 0;
+ int h;
+
+ // Coalesce rows.
+ if (stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ stride_argb = 0;
+ }
+ for (h = 0; h < height && fourcc == 0; ++h) {
+ fourcc = ARGBDetectRow_C(argb, width);
+ argb += stride_argb;
+ }
+ return fourcc;
+}
+
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
@@ -114,8 +163,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
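
A hedged usage sketch for the ARGBDetect helper added above. The frame pointer and geometry are caller-supplied, and the include assumes ARGBDetect is declared in libyuv/compare.h, as in upstream libyuv:

    #include "libyuv/compare.h"
    #include "libyuv/video_common.h"

    // Returns a human-readable guess of the byte order of an opaque 32-bit
    // RGB frame. 0 from ARGBDetect means every candidate alpha byte was 255,
    // so both orders remain plausible.
    static const char* GuessPixelOrder(const uint8* frame, int stride,
                                       int width, int height) {
      uint32 fourcc = libyuv::ARGBDetect(frame, stride, width, height);
      if (fourcc == libyuv::FOURCC_ARGB) return "ARGB";
      if (fourcc == libyuv::FOURCC_BGRA) return "BGRA";
      return "ambiguous (fully opaque either way)";
    }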
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon.cc
index 55052c0eecb..ef006ec41cd 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon.cc
@@ -16,7 +16,8 @@ namespace libyuv {
extern "C" {
#endif
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
@@ -56,46 +57,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
-#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
- volatile uint32 sse;
- asm volatile (
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n"
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %2, %2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "bgt 1b \n"
-
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
- return sse;
-}
-
-#endif // __ARM_NEON__
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon64.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon64.cc
new file mode 100644
index 00000000000..cc078f84cd8
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon64.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %2, %2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
+
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ return sse;
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_posix.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_posix.cc
index ac361190e88..247cb33bbaa 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_posix.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_posix.cc
@@ -25,11 +25,10 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
- "movdqa " MEMACCESS(1) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
"lea " MEMLEA(0x10, 1) ",%1 \n"
- "sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n"
@@ -41,6 +40,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n"
@@ -53,11 +53,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"+r"(src_b), // %1
"+r"(count), // %2
"=g"(sse) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
); // NOLINT
return sse;
}
@@ -124,13 +120,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"pmulld %%xmm5,%%xmm1 \n"
"paddd %%xmm4,%%xmm3 \n"
"paddd %%xmm2,%%xmm1 \n"
- "sub $0x10,%1 \n"
"paddd %%xmm3,%%xmm1 \n"
"pshufd $0xe,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"pshufd $0x1,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
"jg 1b \n"
"movd %%xmm0,%3 \n"
: "+r"(src), // %0
@@ -143,9 +139,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"m"(kHashMul2), // %7
"m"(kHashMul3) // %8
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
); // NOLINT
return hash;
}
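
Two patterns recur in this hunk and in the compare_win.cc hunk below: movdqa becomes movdqu so the kernel no longer needs 16-byte-aligned inputs (which is what lets compare.cc drop its IS_ALIGNED dispatch test above), and the loop counter's sub is moved to sit directly before jg, so the branch consumes the flags the sub just set, a pairing recent x86 cores can macro-fuse. A minimal standalone illustration of that counter placement (GCC/Clang inline asm, x86-64 only, count > 0 as in the row functions' do/while loops; everything here is an assumption for illustration):

    #include <stdint.h>

    static uint64_t SumBytes(const uint8_t* p, int count) {
      uint64_t sum = 0;
      asm volatile(
        "1:                        \n"
        "movzbq (%0),%%rax         \n"  /* load one byte, zero-extended */
        "add    %%rax,%1           \n"  /* accumulate */
        "lea    1(%0),%0           \n"  /* advance; lea leaves flags alone */
        "sub    $1,%2              \n"  /* decrement count; sets the flags */
        "jg     1b                 \n"  /* consumed (and fused) right here */
        : "+r"(p), "+r"(sum), "+r"(count)
        :
        : "memory", "cc", "rax");
      return sum;
    }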
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_win.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_win.cc
index 99831651f5f..e99009a21df 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_win.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_win.cc
@@ -27,13 +27,11 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pxor xmm0, xmm0
pxor xmm5, xmm5
- align 4
wloop:
- movdqa xmm1, [eax]
+ movdqu xmm1, [eax]
lea eax, [eax + 16]
- movdqa xmm2, [edx]
+ movdqu xmm2, [edx]
lea edx, [edx + 16]
- sub ecx, 16
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
@@ -45,6 +43,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
+ sub ecx, 16
jg wloop
pshufd xmm1, xmm0, 0xee
@@ -70,12 +69,10 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
- align 4
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
- sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
@@ -85,6 +82,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
+ sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
@@ -145,7 +143,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33
- align 4
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
@@ -170,7 +167,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
- sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
@@ -178,6 +174,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
+ sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
@@ -195,7 +192,6 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
- align 4
wloop:
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
@@ -209,13 +205,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
- sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
+ sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
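
The compare_win.cc hunks additionally drop the "align 4" directives and switch "movdqa" to "movdqu": the kernels no longer assume 16-byte-aligned source rows. A hypothetical SSE2-intrinsics sketch of the aligned-vs-unaligned load distinction (standard <emmintrin.h> intrinsics; load_row is illustrative):

#include <emmintrin.h>

__m128i load_row(const unsigned char* p) {
  // movdqa (_mm_load_si128) faults unless p is 16-byte aligned;
  // movdqu (_mm_loadu_si128) accepts any address, and on recent x86
  // cores costs the same when the address happens to be aligned.
  return _mm_loadu_si128((const __m128i*)(p));
}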
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert.cc
index a8e294f47fc..41696c18f87 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert.cc
@@ -188,17 +188,14 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
int width, int height) {
int y;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src, 16) &&
- IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@@ -207,8 +204,8 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
}
#endif
#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
@@ -283,20 +280,15 @@ static int X420ToI420(const uint8* src_y,
src_stride_uv = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_SPLITUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
SplitUVRow = SplitUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- SplitUVRow = SplitUVRow_SSE2;
- }
+ SplitUVRow = SplitUVRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
SplitUVRow = SplitUVRow_AVX2;
@@ -304,7 +296,7 @@ static int X420ToI420(const uint8* src_y,
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
SplitUVRow = SplitUVRow_NEON;
@@ -312,15 +304,13 @@ static int X420ToI420(const uint8* src_y,
}
#endif
#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+ IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+ IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
- if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
- IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
- IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
- SplitUVRow = SplitUVRow_MIPS_DSPR2;
- }
+ SplitUVRow = SplitUVRow_MIPS_DSPR2;
}
}
#endif
@@ -391,125 +381,6 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
width, height);
}
-// Convert Q420 to I420.
-// Format is rows of YY/YUYV
-LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- int halfheight;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
- void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int pix) = YUY2ToUV422Row_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
- YUY2ToYRow_C;
- if (!src_y || !src_yuy2 ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_X86)
- if (IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
-
-#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
- YUY2ToYRow = YUY2ToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
- YUY2ToYRow = YUY2ToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width >= 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
- }
- if (IS_ALIGNED(width, 16)) {
- YUY2ToYRow = YUY2ToYRow_NEON;
- YUY2ToUV422Row = YUY2ToUV422Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
-
- YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- src_yuy2 += src_stride_yuy2;
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- CopyRow(src_y, dst_y, width);
- YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
- }
- return 0;
-}
-
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
@@ -529,23 +400,17 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = -src_stride_yuy2;
}
#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -555,11 +420,9 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width >= 16) {
- YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
- }
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
YUY2ToUVRow = YUY2ToUVRow_NEON;
@@ -602,23 +465,17 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = -src_stride_uyvy;
}
#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
UYVYToUVRow = UYVYToUVRow_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUVRow = UYVYToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
}
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
UYVYToUVRow = UYVYToUVRow_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -628,11 +485,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
#if defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
UYVYToYRow = UYVYToYRow_Any_NEON;
- if (width >= 16) {
- UYVYToUVRow = UYVYToUVRow_Any_NEON;
- }
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_NEON;
UYVYToUVRow = UYVYToUVRow_NEON;
@@ -680,23 +535,17 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -706,7 +555,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -714,7 +563,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
@@ -761,34 +610,31 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
src_bgra = src_bgra + (height - 1) * src_stride_bgra;
src_stride_bgra = -src_stride_bgra;
}
-#if defined(HAS_BGRATOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
BGRAToYRow = BGRAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
- BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
- BGRAToUVRow = BGRAToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- BGRAToYRow = BGRAToYRow_SSSE3;
- }
- }
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ BGRAToYRow = BGRAToYRow_SSSE3;
}
}
-#elif defined(HAS_BGRATOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_BGRATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
BGRAToYRow = BGRAToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
BGRAToYRow = BGRAToYRow_NEON;
}
- if (width >= 16) {
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
BGRAToUVRow = BGRAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
BGRAToUVRow = BGRAToUVRow_NEON;
}
}
- }
#endif
for (y = 0; y < height - 1; y += 2) {
@@ -830,32 +676,29 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
-#if defined(HAS_ABGRTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
- ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ABGRToYRow = ABGRToYRow_SSSE3;
- }
- }
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
}
}
-#elif defined(HAS_ABGRTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ABGRToYRow = ABGRToYRow_NEON;
}
- if (width >= 16) {
- ABGRToUVRow = ABGRToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
}
}
#endif
@@ -899,32 +742,29 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
src_rgba = src_rgba + (height - 1) * src_stride_rgba;
src_stride_rgba = -src_stride_rgba;
}
-#if defined(HAS_RGBATOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
RGBAToYRow = RGBAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
- RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
- RGBAToUVRow = RGBAToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- RGBAToYRow = RGBAToYRow_SSSE3;
- }
- }
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ RGBAToYRow = RGBAToYRow_SSSE3;
}
}
-#elif defined(HAS_RGBATOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
RGBAToYRow = RGBAToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGBAToYRow = RGBAToYRow_NEON;
}
- if (width >= 16) {
- RGBAToUVRow = RGBAToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToUVRow = RGBAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_NEON;
}
}
#endif
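
A recurring restructure in the hunks above turns "#elif"-chained CPU paths into independent "#if"/"#endif" blocks, so each optional fast path installs itself locally and adding a new one (e.g. a separate UV row path) no longer means rewriting a chain. A hypothetical, self-contained sketch of that shape (row_* and pick_row are stand-ins; the macros are the usual compiler-defined feature tests):

#include <stdio.h>

typedef void (*RowFn)(void);
static void row_c(void) { puts("C"); }
#if defined(__SSSE3__)
static void row_ssse3(void) { puts("SSSE3"); }
#endif
#if defined(__ARM_NEON)
static void row_neon(void) { puts("NEON"); }
#endif

RowFn pick_row(void) {
  RowFn fn = row_c;
#if defined(__SSSE3__)
  fn = row_ssse3;  // independent block: compiled only on x86 targets
#endif
#if defined(__ARM_NEON)
  fn = row_neon;   // independent block: compiled only on ARM targets
#endif
  return fn;
}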
@@ -978,22 +818,23 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
#if defined(HAS_RGB24TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToYRow = RGB24ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToYRow = RGB24ToYRow_NEON;
}
- if (width >= 16) {
- RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVRow = RGB24ToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_RGB24TOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
-#else // HAS_RGB24TOYROW_NEON
-
+#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
@@ -1001,7 +842,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@@ -1009,17 +850,13 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
-#endif // HAS_RGB24TOYROW_NEON
{
#if !defined(HAS_RGB24TOYROW_NEON)
@@ -1095,22 +932,23 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
#if defined(HAS_RAWTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
RAWToYRow = RAWToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToYRow = RAWToYRow_NEON;
}
- if (width >= 16) {
- RAWToUVRow = RAWToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RAWToUVRow = RAWToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_RAWTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVRow = RAWToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVRow = RAWToUVRow_NEON;
}
}
-#else // HAS_RAWTOYROW_NEON
-
+#endif
#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
@@ -1118,7 +956,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@@ -1126,17 +964,13 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
-#endif // HAS_RAWTOYROW_NEON
{
// Allocate 2 rows of ARGB.
@@ -1210,22 +1044,20 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
#if defined(HAS_RGB565TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToYRow = RGB565ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB565ToYRow = RGB565ToYRow_NEON;
}
- if (width >= 16) {
- RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToUVRow = RGB565ToUVRow_NEON;
- }
+ RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_NEON;
}
}
#else // HAS_RGB565TOYROW_NEON
#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
@@ -1233,7 +1065,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@@ -1241,13 +1073,10 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
@@ -1327,22 +1156,20 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
#if defined(HAS_ARGB1555TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToYRow = ARGB1555ToYRow_NEON;
}
- if (width >= 16) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
- }
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
}
}
#else // HAS_ARGB1555TOYROW_NEON
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
@@ -1350,7 +1177,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@@ -1358,13 +1185,10 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
@@ -1445,22 +1269,20 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
#if defined(HAS_ARGB4444TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToYRow = ARGB4444ToYRow_NEON;
}
- if (width >= 16) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
- }
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
}
}
#else // HAS_ARGB4444TOYROW_NEON
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
@@ -1468,7 +1290,7 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@@ -1476,13 +1298,10 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
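
The convert.cc hunks above all apply one dispatch simplification: the "width >= N" guards and the _Unaligned_ middle tier disappear, because the _Any_ row variants already handle every width (SIMD body plus a scalar tail) and the fully SIMD variant is selected only when the width is an exact multiple. A hypothetical, self-contained sketch of the resulting two-tier pattern (the *_sk functions are stand-ins, not libyuv symbols):

typedef unsigned char uint8;

static void CopyRow_C_sk(const uint8* s, uint8* d, int w) { while (w-- > 0) *d++ = *s++; }
static void CopyRow_SSE2_sk(const uint8* s, uint8* d, int w) { while (w-- > 0) *d++ = *s++; }
static void CopyRow_Any_SSE2_sk(const uint8* s, uint8* d, int w) { while (w-- > 0) *d++ = *s++; }

void CopyPlaneSketch(const uint8* src, uint8* dst, int width, int cpu_has_sse2) {
  void (*CopyRow)(const uint8*, uint8*, int) = CopyRow_C_sk;
  if (cpu_has_sse2) {
    // The _Any_ variant finishes the tail in scalar code, so no
    // 'width >= 32' guard is needed; an exact multiple of the vector
    // width still picks the full-SIMD variant.
    CopyRow = (width % 32 == 0) ? CopyRow_SSE2_sk : CopyRow_Any_SSE2_sk;
  }
  CopyRow(src, dst, width);
}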
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_argb.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_argb.cc
index ac0bc3d156f..66f7660793a 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_argb.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_argb.cc
@@ -11,7 +11,6 @@
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
@@ -79,17 +78,15 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I444TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
- }
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_I444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I444ToARGBRow = I444ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I444ToARGBRow = I444ToARGBRow_NEON;
@@ -141,18 +138,15 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@@ -160,7 +154,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
@@ -221,17 +215,15 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I411TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I411ToARGBRow = I411ToARGBRow_SSSE3;
- }
+ I411ToARGBRow = I411ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_I411TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I411TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I411ToARGBRow = I411ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I411ToARGBRow = I411ToARGBRow_NEON;
@@ -276,15 +268,23 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_YTOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
YToARGBRow = YToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_SSE2;
}
}
-#elif defined(HAS_YTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_YTOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YToARGBRow = YToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ YToARGBRow = YToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
YToARGBRow = YToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_NEON;
@@ -326,17 +326,15 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_I400TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I400ToARGBRow = I400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I400ToARGBRow = I400ToARGBRow_SSE2;
- }
+ I400ToARGBRow = I400ToARGBRow_SSE2;
}
}
-#elif defined(HAS_I400TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I400ToARGBRow = I400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_NEON;
@@ -447,15 +445,15 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
src_stride_rgb24 = dst_stride_argb = 0;
}
#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_RGB24TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToARGBRow = RGB24ToARGBRow_NEON;
@@ -497,15 +495,15 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
src_stride_raw = dst_stride_argb = 0;
}
#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
}
}
-#elif defined(HAS_RAWTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToARGBRow = RAWToARGBRow_NEON;
@@ -547,15 +545,15 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
src_stride_rgb565 = dst_stride_argb = 0;
}
#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
}
}
-#elif defined(HAS_RGB565TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_NEON;
@@ -597,15 +595,15 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
src_stride_argb1555 = dst_stride_argb = 0;
}
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
}
}
-#elif defined(HAS_ARGB1555TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
@@ -647,15 +645,15 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
src_stride_argb4444 = dst_stride_argb = 0;
}
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
}
}
-#elif defined(HAS_ARGB4444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
@@ -693,17 +691,23 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
@@ -744,18 +748,23 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_NV21TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV21ToARGBRow = NV21ToARGBRow_SSSE3;
- }
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_NV21TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_NEON;
@@ -795,17 +804,23 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
@@ -852,19 +867,23 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = dst_stride_argb = 0;
}
#if defined(HAS_YUY2TOARGBROW_SSSE3)
- // Posix is 16, Windows is 8.
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
- }
+ YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_YUY2TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
YUY2ToARGBRow = YUY2ToARGBRow_NEON;
@@ -905,19 +924,23 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = dst_stride_argb = 0;
}
#if defined(HAS_UYVYTOARGBROW_SSSE3)
- // Posix is 16, Windows is 8.
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- UYVYToARGBRow = UYVYToARGBRow_SSSE3;
- }
+ UYVYToARGBRow = UYVYToARGBRow_SSSE3;
}
}
-#elif defined(HAS_UYVYTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToARGBRow = UYVYToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
UYVYToARGBRow = UYVYToARGBRow_NEON;
@@ -932,6 +955,152 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
return 0;
}
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*J422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = J422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J422ToARGBRow = J422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J422ToARGBRow = J422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*J422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = J422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J422ToARGBRow = J422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J422ToARGBRow = J422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
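
convert_argb.cc gains J420ToARGB/J422ToARGB for full-range (JPEG) YUV, alongside new AVX2 rows for several converters. The only structural difference between the two new functions is chroma stepping: 4:2:0 advances the U/V rows after every odd luma row, 4:2:2 after every row. A hypothetical sketch of that stepping (RowFn and convert_planes are placeholders, not libyuv API):

#include <stdint.h>

typedef void (*RowFn)(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                      uint8_t* rgb, int width);

static void convert_planes(RowFn row, int is_420,
                           const uint8_t* y, int ys,
                           const uint8_t* u, int us,
                           const uint8_t* v, int vs,
                           uint8_t* dst, int ds,
                           int width, int height) {
  for (int r = 0; r < height; ++r) {
    row(y, u, v, dst, width);
    y += ys;
    dst += ds;
    if (!is_420 || (r & 1)) {  // 4:2:0: step chroma after odd rows only,
      u += us;                 // so luma rows 0 and 1 share chroma row 0
      v += vs;
    }
  }
}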
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from.cc
index c1a2f62f020..b743cde264b 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from.cc
@@ -13,7 +13,6 @@
#include "libyuv/basic_types.h"
#include "libyuv/convert.h" // For I420Copy
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/scale.h" // For ScalePlane()
@@ -174,14 +173,15 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
}
#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
}
-#elif defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
@@ -220,14 +220,15 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
dst_stride_yuy2 = -dst_stride_yuy2;
}
#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
}
-#elif defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
@@ -280,14 +281,15 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
}
#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
}
-#elif defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
@@ -326,14 +328,15 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
dst_stride_uyvy = -dst_stride_uyvy;
}
#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
}
-#elif defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
@@ -397,20 +400,15 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
src_stride_u = src_stride_v = dst_stride_uv = 0;
}
#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
- IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
- IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
+ MergeUVRow_ = MergeUVRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@@ -418,7 +416,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_NEON;
@@ -476,18 +474,15 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@@ -495,7 +490,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
@@ -548,23 +543,30 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
dst_stride_bgra = -dst_stride_bgra;
}
#if defined(HAS_I422TOBGRAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
- I422ToBGRARow = I422ToBGRARow_SSSE3;
- }
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToBGRARow = I422ToBGRARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToBGRARow = I422ToBGRARow_AVX2;
}
}
-#elif defined(HAS_I422TOBGRAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_NEON;
}
}
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
@@ -610,17 +612,23 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
dst_stride_abgr = -dst_stride_abgr;
}
#if defined(HAS_I422TOABGRROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
- I422ToABGRRow = I422ToABGRRow_SSSE3;
- }
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_AVX2;
}
}
-#elif defined(HAS_I422TOABGRROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToABGRRow = I422ToABGRRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_NEON;
@@ -664,17 +672,23 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
dst_stride_rgba = -dst_stride_rgba;
}
#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
}
}
-#elif defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGBARow = I422ToRGBARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_NEON;
@@ -718,14 +732,15 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
dst_stride_rgb24 = -dst_stride_rgb24;
}
#if defined(HAS_I422TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGB24Row = I422ToRGB24Row_SSSE3;
}
}
-#elif defined(HAS_I422TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRGB24Row = I422ToRGB24Row_NEON;
@@ -769,14 +784,15 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
dst_stride_raw = -dst_stride_raw;
}
#if defined(HAS_I422TORAWROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_SSSE3;
}
}
-#elif defined(HAS_I422TORAWROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToRAWRow = I422ToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_NEON;
@@ -820,14 +836,23 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
dst_stride_argb1555 = -dst_stride_argb1555;
}
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
}
}
-#elif defined(HAS_I422TOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGB1555Row = I422ToARGB1555Row_NEON;
@@ -872,14 +897,23 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
dst_stride_argb4444 = -dst_stride_argb4444;
}
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
}
}
-#elif defined(HAS_I422TOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGB4444Row = I422ToARGB4444Row_NEON;
@@ -923,14 +957,23 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
dst_stride_rgb565 = -dst_stride_rgb565;
}
#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGB565Row = I422ToRGB565Row_SSSE3;
}
}
-#elif defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRGB565Row = I422ToRGB565Row_NEON;
@@ -1054,38 +1097,6 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_sample_stride ? dst_sample_stride : width * 4,
width, height);
break;
- case FOURCC_BGGR:
- r = I420ToBayerBGGR(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_GBRG:
- r = I420ToBayerGBRG(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_GRBG:
- r = I420ToBayerGRBG(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_RGGB:
- r = I420ToBayerRGGB(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
case FOURCC_I400:
r = I400Copy(y, y_stride,
dst_sample,
@@ -1116,7 +1127,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
}
- // TODO(fbarchard): Add M420 and Q420.
+ // TODO(fbarchard): Add M420.
// Triplanar formats
// TODO(fbarchard): halfstride instead of halfwidth
case FOURCC_I420:
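
convert_from.cc drops the Bayer (FOURCC_BGGR/GBRG/GRBG/RGGB) output paths along with the format_conversion.h include, and its context lines show the row-coalescing guard ("src_stride_... = 0") that several converters rely on. A hypothetical self-contained sketch of that trick (coalesce and its parameters are illustrative):

void coalesce(int* width, int* height, int* stride_y, int* stride_argb) {
  // When each plane is stored contiguously (stride == row length in bytes),
  // the whole image can be processed as one very wide row; the strides are
  // then never used to advance a pointer, so zeroing them is safe.
  if (*stride_y == *width && *stride_argb == *width * 4) {
    *width *= *height;
    *height = 1;
    *stride_y = 0;
    *stride_argb = 0;
  }
}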
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from_argb.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from_argb.cc
index de461ddb046..dc2186a6a08 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from_argb.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from_argb.cc
@@ -12,7 +12,6 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"
@@ -51,17 +50,15 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOUV444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_SSSE3;
- }
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
}
}
-#elif defined(HAS_ARGBTOUV444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToUV444Row = ARGBToUV444Row_NEON;
@@ -69,19 +66,16 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -130,17 +124,15 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
@@ -149,18 +141,15 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -209,19 +198,15 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
@@ -229,7 +214,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -237,7 +222,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUV411ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
if (IS_ALIGNED(width, 32)) {
ARGBToUV411Row = ARGBToUV411Row_NEON;
@@ -281,22 +266,17 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -304,7 +284,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
@@ -312,18 +292,15 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
+ MergeUVRow_ = MergeUVRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@@ -331,7 +308,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_NEON;
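
ARGBToNV12 fills the biplanar output in two passes per row pair: ARGBToUVRow produces planar U and V, then MergeUVRow_ interleaves them into the NV12 UV plane. The behaviour the SSE2/AVX2/NEON merge rows accelerate is, conceptually (a sketch, not the libyuv source):

    #include <stdint.h>

    // Interleave one row of planar U and V into NV12-style UV pairs.
    static void MergeUVRowSketch(const uint8_t* src_u, const uint8_t* src_v,
                                 uint8_t* dst_uv, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_uv[2 * x + 0] = src_u[x];  // U first for NV12...
        dst_uv[2 * x + 1] = src_v[x];  // ...then V; NV21 swaps the planes
      }
    }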
@@ -388,22 +365,17 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -411,7 +383,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
@@ -419,18 +391,15 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
+ MergeUVRow_ = MergeUVRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@@ -438,7 +407,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_NEON;
@@ -500,17 +469,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_yuy2 = 0;
}
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
@@ -518,17 +485,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -537,14 +502,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
#endif
#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
}
-#elif defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
@@ -602,17 +568,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_uyvy = 0;
}
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
@@ -620,17 +584,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -639,14 +601,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
#endif
#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
}
-#elif defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
@@ -697,19 +660,15 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
@@ -717,7 +676,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -773,14 +732,15 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_rgb24 = 0;
}
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
}
}
-#elif defined(HAS_ARGBTORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
@@ -820,14 +780,15 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_raw = 0;
}
#if defined(HAS_ARGBTORAWROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTORAWROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRAWRow = ARGBToRAWRow_NEON;
@@ -843,6 +804,46 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
return 0;
}
+static const uint8 kDither8x8[64] = {
+ 0, 128, 32, 160, 8, 136, 40, 168,
+ 192, 64, 224, 96, 200, 72, 232, 104,
+ 48, 176, 16, 144, 56, 184, 24, 152,
+ 240, 112, 208, 80, 248, 120, 216, 88,
+ 12, 140, 44, 172, 4, 132, 36, 164,
+ 204, 76, 236, 108, 196, 68, 228, 100,
+ 60, 188, 28, 156, 52, 180, 20, 148,
+ 252, 124, 220, 92, 244, 116, 212, 84,
+};
+
+// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ const uint8* dither8x8, int width, int height) {
+ int y;
+ void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+ const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C;
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ if (!dither8x8) {
+    dither8x8 = kDither8x8;
+  }
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB565DitherRow(src_argb, dst_rgb565,
+ dither8x8 + ((y & 7) << 3), width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
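The new ARGBToRGB565Dither applies ordered dithering before truncating 8-bit channels to 5:6:5: row y adds the eight thresholds at dither8x8 + ((y & 7) << 3), and a NULL table falls back to the built-in kDither8x8 above. A caller sketch (the prototype is restated with standard types; buffer sizes are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    // Restated prototype of the function added above (uint8 == uint8_t).
    int ARGBToRGB565Dither(const uint8_t* src_argb, int src_stride_argb,
                           uint8_t* dst_rgb565, int dst_stride_rgb565,
                           const uint8_t* dither8x8, int width, int height);

    enum { kW = 64, kH = 48 };
    static uint8_t argb[kW * kH * 4];    // 4 bytes/pixel, BGRA byte order
    static uint8_t rgb565[kW * kH * 2];  // 2 bytes/pixel

    int ConvertWithDither(void) {
      // NULL selects the built-in 8x8 ordered-dither matrix.
      return ARGBToRGB565Dither(argb, kW * 4,
                                rgb565, kW * 2,
                                NULL, kW, kH);
    }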
// Convert ARGB To RGB565.
LIBYUV_API
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
@@ -867,15 +868,23 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_rgb565 = 0;
}
#if defined(HAS_ARGBTORGB565ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
}
-#elif defined(HAS_ARGBTORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565Row = ARGBToRGB565Row_NEON;
@@ -915,15 +924,23 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb1555 = 0;
}
#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
}
}
-#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
@@ -963,15 +980,23 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb4444 = 0;
}
#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
}
}
-#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
@@ -1011,23 +1036,17 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
- ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
- if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
- }
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
@@ -1035,7 +1054,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;
@@ -1043,7 +1062,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_NEON;
@@ -1067,6 +1086,80 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
return 0;
}
+// ARGB little endian (bgra in memory) to J422
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUVJ422Row_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYJRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOUVJ422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJ422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;
+ }
+ }
+#endif
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVJ422Row(src_argb, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
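ARGBToJ422 produces full-range ("J", JPEG-style) YUV with 4:2:2 subsampling: one U and one V sample per 2x1 pixel pair, so the chroma planes are half width but full height. A caller sketch showing the plane sizes (the prototype is restated with standard types; ConvertFrame is a hypothetical helper):

    #include <stdint.h>
    #include <stdlib.h>

    int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb,
                   uint8_t* dst_y, int dst_stride_y,
                   uint8_t* dst_u, int dst_stride_u,
                   uint8_t* dst_v, int dst_stride_v,
                   int width, int height);

    int ConvertFrame(const uint8_t* argb, int width, int height) {
      int halfwidth = (width + 1) / 2;
      uint8_t* y = (uint8_t*)malloc((size_t)width * height);
      uint8_t* u = (uint8_t*)malloc((size_t)halfwidth * height);  // half width,
      uint8_t* v = (uint8_t*)malloc((size_t)halfwidth * height);  // full height
      int r = ARGBToJ422(argb, width * 4, y, width,
                         u, halfwidth, v, halfwidth, width, height);
      free(y); free(u); free(v);
      return r;
    }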
// Convert ARGB to J400.
LIBYUV_API
int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
@@ -1091,19 +1184,15 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_yj = 0;
}
#if defined(HAS_ARGBTOYJROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
@@ -1111,7 +1200,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_argb.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_argb.cc
index 1b228a7b4d9..af829fbd32b 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_argb.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_argb.cc
@@ -11,7 +11,6 @@
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
@@ -144,36 +143,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
- // TODO(fbarchard): Support cropping Bayer by odd numbers
- // by adjusting fourcc.
- case FOURCC_BGGR:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerBGGRToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- case FOURCC_GBRG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGBRGToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- case FOURCC_GRBG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGRBGToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- case FOURCC_RGGB:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerRGGBToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToARGB(src, src_width,
@@ -205,15 +174,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
-// case FOURCC_Q420:
-// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
-// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
-// src_width + crop_x * 2;
-// r = Q420ToARGB(src, src_width * 3,
-// src_uv, src_width * 3,
-// crop_argb, argb_stride,
-// crop_width, inv_crop_height);
-// break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
@@ -241,6 +201,25 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_width, inv_crop_height);
break;
}
+
+ case FOURCC_J420: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+
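The FOURCC_J420 case locates the three planes inside one contiguous buffer: a full-size Y plane followed by quarter-size U and V planes, with the crop offsets halved for chroma. With zero crop the arithmetic reduces to the plain plane sizes; a worked example (values are illustrative):

    #include <stddef.h>

    static size_t J420UOffset(int src_width, int abs_src_height) {
      return (size_t)src_width * abs_src_height;  // U starts where Y ends
    }
    static size_t J420VOffset(int src_width, int abs_src_height) {
      int halfwidth  = (src_width + 1) / 2;
      int halfheight = (abs_src_height + 1) / 2;
      return (size_t)src_width * abs_src_height +
             (size_t)halfwidth * halfheight;      // V starts where U ends
    }
    // For 640x480: U at 307200, V at 307200 + 320 * 240 == 384000.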
case FOURCC_I422:
case FOURCC_YV16: {
const uint8* src_y = sample + src_width * crop_y + crop_x;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_i420.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_i420.cc
index 7b194fff721..5e75369b55a 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_i420.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_i420.cc
@@ -12,7 +12,6 @@
#include "libyuv/convert.h"
-#include "libyuv/format_conversion.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
@@ -173,40 +172,6 @@ int ConvertToI420(const uint8* sample,
v, v_stride,
crop_width, inv_crop_height);
break;
- // TODO(fbarchard): Support cropping Bayer by odd numbers
- // by adjusting fourcc.
- case FOURCC_BGGR:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerBGGRToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_GBRG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGBRGToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_GRBG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGRBGToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RGGB:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerRGGBToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToI420(src, src_width,
@@ -218,7 +183,8 @@ int ConvertToI420(const uint8* sample,
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ src_uv = sample + (src_width * src_height) +
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
@@ -228,7 +194,8 @@ int ConvertToI420(const uint8* sample,
break;
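
The corrected src_uv expression (used for both NV12 above and NV21 below) addresses the biplanar layout directly: the interleaved UV plane starts after src_width * src_height luma bytes, each UV row is aligned_src_width bytes and covers two source rows, and two source columns map to one two-byte UV pair. The old form folded crop_y into the luma stride and applied crop_x unhalved. A worked example (numbers are illustrative):

    #include <stddef.h>

    static size_t Nv12UvOffset(int src_width, int src_height,
                               int aligned_src_width, int crop_x, int crop_y) {
      return (size_t)src_width * src_height            // skip the Y plane
           + (size_t)(crop_y / 2) * aligned_src_width  // crop_y/2 UV rows down
           + (size_t)(crop_x / 2) * 2;                 // crop_x/2 UV pairs across
    }
    // Nv12UvOffset(640, 480, 640, 100, 50) == 307200 + 16000 + 100 == 323300.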
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ src_uv = sample + (src_width * src_height) +
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
// Call NV12 but with u and v parameters swapped.
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
@@ -245,17 +212,6 @@ int ConvertToI420(const uint8* sample,
v, v_stride,
crop_width, inv_crop_height);
break;
- case FOURCC_Q420:
- src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
- src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
- src_width + crop_x * 2;
- r = Q420ToI420(src, src_width * 3,
- src_uv, src_width * 3,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/cpu_id.cc
index 8f8a403ee3e..1efa2652581 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/cpu_id.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/cpu_id.cc
@@ -52,7 +52,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER) && !defined(__clang__)
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
-#elif defined(_M_IX86)
+#endif
+#if defined(_M_IX86)
__asm {
mov eax, info_eax
mov ecx, info_ecx
@@ -98,13 +99,15 @@ int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
-#elif defined(_M_IX86) && defined(_MSC_VER)
+#endif
+#if defined(_M_IX86) && defined(_MSC_VER)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
-#elif defined(__i386__) || defined(__x86_64__)
+#endif
+#if defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(_MSC_VER)
return((xcr0 & 6) == 6); // Is ymm saved?
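
TestOsSaveYmm reads XCR0 with xgetbv (opcode 0f 01 d0, ecx = 0): bit 1 set means the OS saves XMM state and bit 2 means it saves YMM state, and AVX is only safe when both are set, hence the & 6 mask. The same check in GCC/Clang constraint syntax (a sketch under that toolchain assumption, not libyuv code):

    #include <stdint.h>

    static int OsSavesYmm(void) {
      uint32_t eax, edx;
      // xgetbv: reads the extended control register selected by ecx into edx:eax.
      __asm__(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
      return (eax & 6) == 6;  // bit 1: XMM state saved; bit 2: YMM state saved
    }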
@@ -135,6 +138,12 @@ int ArmCpuCaps(const char* cpuinfo_name) {
fclose(f);
return kCpuHasNEON;
}
+ // aarch64 uses asimd for Neon.
+ p = strstr(cpuinfo_line, " asimd");
+ if (p && (p[6] == ' ' || p[6] == '\n')) {
+ fclose(f);
+ return kCpuHasNEON;
+ }
}
}
fclose(f);
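
On aarch64 Linux the kernel reports NEON as "asimd" (Advanced SIMD) in /proc/cpuinfo rather than "neon", which is why ArmCpuCaps now accepts either token; the trailing-character check keeps a longer token that merely starts with "asimd" from matching. A self-contained check against a sample Features line (line contents vary by kernel and CPU):

    #include <stdio.h>
    #include <string.h>

    int main(void) {
      const char line[] = "Features : fp asimd evtstrm aes pmull sha1 sha2 crc32\n";
      const char* p = strstr(line, " asimd");
      printf("%d\n", p && (p[6] == ' ' || p[6] == '\n'));  // prints 1
      return 0;
    }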
@@ -240,7 +249,8 @@ int InitCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info_ &= ~kCpuHasFMA3;
}
-#elif defined(__mips__) && defined(__linux__)
+#endif
+#if defined(__mips__) && defined(__linux__)
// Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
@@ -257,7 +267,8 @@ int InitCpuFlags(void) {
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
}
-#elif defined(__arm__) || defined(__aarch64__)
+#endif
+#if defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__.
 // __ARM_NEON__ generates code that requires Neon. NaCl also requires Neon.
 // On Linux, /proc/cpuinfo can be checked; without it, Neon is assumed.
@@ -266,7 +277,8 @@ int InitCpuFlags(void) {
 // On aarch64 (arm64), the /proc/cpuinfo feature list is incomplete, e.g. it
 // has no neon flag.
 // So for aarch64, Neon support is hard-coded here.
-#elif defined(__aarch64__)
+#endif
+#if defined(__aarch64__)
cpu_info_ = kCpuHasNEON;
#else
// Linux arm parse text file for neon detect.
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/format_conversion.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/format_conversion.cc
deleted file mode 100644
index 3c173715320..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/format_conversion.cc
+++ /dev/null
@@ -1,554 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/format_conversion.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/video_common.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// generate a selector mask useful for pshufb
-static uint32 GenerateSelector(int select0, int select1) {
- return (uint32)(select0) |
- (uint32)((select1 + 4) << 8) |
- (uint32)((select0 + 8) << 16) |
- (uint32)((select1 + 12) << 24);
-}
-
-static int MakeSelectors(const int blue_index,
- const int green_index,
- const int red_index,
- uint32 dst_fourcc_bayer,
- uint32* index_map) {
- // Now build a lookup table containing the indices for the four pixels in each
- // 2x2 Bayer grid.
- switch (dst_fourcc_bayer) {
- case FOURCC_BGGR:
- index_map[0] = GenerateSelector(blue_index, green_index);
- index_map[1] = GenerateSelector(green_index, red_index);
- break;
- case FOURCC_GBRG:
- index_map[0] = GenerateSelector(green_index, blue_index);
- index_map[1] = GenerateSelector(red_index, green_index);
- break;
- case FOURCC_RGGB:
- index_map[0] = GenerateSelector(red_index, green_index);
- index_map[1] = GenerateSelector(green_index, blue_index);
- break;
- case FOURCC_GRBG:
- index_map[0] = GenerateSelector(green_index, red_index);
- index_map[1] = GenerateSelector(blue_index, green_index);
- break;
- default:
- return -1; // Bad FourCC
- }
- return 0;
-}
-
-// Converts 32 bit ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height,
- uint32 dst_fourcc_bayer) {
- int y;
- const int blue_index = 0; // Offsets for ARGB format
- const int green_index = 1;
- const int red_index = 2;
- uint32 index_map[2];
- void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerRow_C;
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- }
- }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_NEON;
- }
- }
-#endif
- if (MakeSelectors(blue_index, green_index, red_index,
- dst_fourcc_bayer, index_map)) {
- return -1; // Bad FourCC
- }
-
- for (y = 0; y < height; ++y) {
- ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
- src_argb += src_stride_argb;
- dst_bayer += dst_stride_bayer;
- }
- return 0;
-}
-
-#define AVG(a, b) (((a) + (b)) >> 1)
-
-static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 g = src_bayer0[1];
- uint8 r = src_bayer1[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = src_bayer0[0];
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = AVG(r, src_bayer1[1]);
- dst_argb[3] = 255U;
- dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer1[1];
- dst_argb[7] = 255U;
- g = src_bayer0[1];
- r = src_bayer1[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = src_bayer0[0];
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = AVG(r, src_bayer1[1]);
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer0[0];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer1[1];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 g = src_bayer0[1];
- uint8 b = src_bayer1[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = AVG(b, src_bayer1[1]);
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = src_bayer0[0];
- dst_argb[3] = 255U;
- dst_argb[4] = src_bayer1[1];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[7] = 255U;
- g = src_bayer0[1];
- b = src_bayer1[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = AVG(b, src_bayer1[1]);
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = src_bayer0[0];
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer1[1];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer0[0];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 b = src_bayer0[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = AVG(b, src_bayer0[1]);
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = src_bayer1[0];
- dst_argb[3] = 255U;
- dst_argb[4] = src_bayer0[1];
- dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_argb[7] = 255U;
- b = src_bayer0[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = AVG(b, src_bayer0[1]);
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = src_bayer1[0];
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer0[1];
- dst_argb[5] = src_bayer0[0];
- dst_argb[6] = src_bayer1[0];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 r = src_bayer0[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = src_bayer1[0];
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = AVG(r, src_bayer0[1]);
- dst_argb[3] = 255U;
- dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[6] = src_bayer0[1];
- dst_argb[7] = 255U;
- r = src_bayer0[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = src_bayer1[0];
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = AVG(r, src_bayer0[1]);
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer1[0];
- dst_argb[5] = src_bayer0[0];
- dst_argb[6] = src_bayer0[1];
- dst_argb[7] = 255U;
- }
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height,
- uint32 src_fourcc_bayer) {
- int y;
- void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- switch (src_fourcc_bayer) {
- case FOURCC_BGGR:
- BayerRow0 = BayerRowBG;
- BayerRow1 = BayerRowGR;
- break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
- break;
- case FOURCC_GRBG:
- BayerRow0 = BayerRowGR;
- BayerRow1 = BayerRowBG;
- break;
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
- default:
- return -1; // Bad FourCC
- }
-
- for (y = 0; y < height - 1; y += 2) {
- BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
- BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
- dst_argb + dst_stride_argb, width);
- src_bayer += src_stride_bayer * 2;
- dst_argb += dst_stride_argb * 2;
- }
- if (height & 1) {
- BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
- }
- return 0;
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
- uint32 src_fourcc_bayer) {
- void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
-
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- int halfheight;
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
-
- switch (src_fourcc_bayer) {
- case FOURCC_BGGR:
- BayerRow0 = BayerRowBG;
- BayerRow1 = BayerRowGR;
- break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
- break;
- case FOURCC_GRBG:
- BayerRow0 = BayerRowGR;
- BayerRow1 = BayerRowBG;
- break;
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
- default:
- return -1; // Bad FourCC
- }
-
- {
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
- int y;
- for (y = 0; y < height - 1; y += 2) {
- BayerRow0(src_bayer, src_stride_bayer, row, width);
- BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
- row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
- src_bayer += src_stride_bayer * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- BayerRow0(src_bayer, src_stride_bayer, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- }
- free_aligned_buffer_64(row);
- }
- return 0;
-}
-
-// Convert I420 to Bayer.
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height,
- uint32 dst_fourcc_bayer) {
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
- void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerRow_C;
- const int blue_index = 0; // Offsets for ARGB format
- const int green_index = 1;
- const int red_index = 2;
- uint32 index_map[2];
- // Negative height means invert the image.
- if (height < 0) {
- int halfheight;
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
- I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
- }
-#endif
-
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- }
- }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_NEON;
- }
- }
-#endif
-
- if (MakeSelectors(blue_index, green_index, red_index,
- dst_fourcc_bayer, index_map)) {
- return -1; // Bad FourCC
- }
- {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- int y;
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row, width);
- ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
- dst_bayer += dst_stride_bayer;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- free_aligned_buffer_64(row);
- }
- return 0;
-}
-
-#define MAKEBAYERFOURCC(BAYER) \
-LIBYUV_API \
-int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
- uint8* dst_y, int dst_stride_y, \
- uint8* dst_u, int dst_stride_u, \
- uint8* dst_v, int dst_stride_v, \
- int width, int height) { \
- return BayerToI420(src_bayer, src_stride_bayer, \
- dst_y, dst_stride_y, \
- dst_u, dst_stride_u, \
- dst_v, dst_stride_v, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
- const uint8* src_u, int src_stride_u, \
- const uint8* src_v, int src_stride_v, \
- uint8* dst_bayer, int dst_stride_bayer, \
- int width, int height) { \
- return I420ToBayer(src_y, src_stride_y, \
- src_u, src_stride_u, \
- src_v, src_stride_v, \
- dst_bayer, dst_stride_bayer, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
- uint8* dst_bayer, int dst_stride_bayer, \
- int width, int height) { \
- return ARGBToBayer(src_argb, src_stride_argb, \
- dst_bayer, dst_stride_bayer, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
- uint8* dst_argb, int dst_stride_argb, \
- int width, int height) { \
- return BayerToARGB(src_bayer, src_stride_bayer, \
- dst_argb, dst_stride_argb, \
- width, height, \
- FOURCC_##BAYER); \
-}
-
-MAKEBAYERFOURCC(BGGR)
-MAKEBAYERFOURCC(GBRG)
-MAKEBAYERFOURCC(GRBG)
-MAKEBAYERFOURCC(RGGB)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/mjpeg_validate.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/mjpeg_validate.cc
index 23d22d099bb..40ce2f787a1 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/mjpeg_validate.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/mjpeg_validate.cc
@@ -10,15 +10,66 @@
#include "libyuv/mjpeg_decoder.h"
+#include <string.h> // For memchr.
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+// Enable this to try the scasb implementation.
+// #define ENABLE_SCASB 1
+
+#ifdef ENABLE_SCASB
+
+// Works on any count (multiple of 1).
+__declspec(naked) __declspec(align(16))
+const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // src
+ mov eax, [esp + 8] // val
+ mov ecx, [esp + 12] // count
+ repne scasb
+ jne sr99
+ mov eax, edi
+ sub eax, 1
+ mov edi, edx
+ ret
+
+ sr99:
+ mov eax, 0
+ mov edi, edx
+ ret
+ }
+}
+#endif
+
+// Helper function to scan for EOI marker.
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+ const uint8* end = sample + sample_size - 1;
+ const uint8* it = sample;
+ for (;;) {
+#ifdef ENABLE_SCASB
+ it = ScanRow_ERMS(it, 0xff, end - it);
+#else
+ it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
+#endif
+ if (it == NULL) {
+ break;
+ }
+ if (it[1] == 0xd9) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
+ }
+ ++it; // Skip over current 0xff.
+ }
+ // ERROR: Invalid jpeg end code not found. Size sample_size
+ return LIBYUV_FALSE;
+}
+
// Helper function to validate the jpeg appears intact.
-// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
- size_t i;
+ const size_t kBackSearchSize = 1024;
if (sample_size < 64) {
// ERROR: Invalid jpeg size: sample_size
return LIBYUV_FALSE;
@@ -27,17 +78,20 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
- for (i = sample_size - 2; i > 1;) {
- if (sample[i] != 0xd9) {
- if (sample[i] == 0xff && sample[i + 1] == 0xd9) { // End Of Image
- return LIBYUV_TRUE; // Success: Valid jpeg.
- }
- --i;
+ // Step over SOI marker.
+ sample += 2;
+ sample_size -= 2;
+
+  // Look for the End Of Image (EOI) marker in the last kilobyte of the buffer.
+ if (sample_size > kBackSearchSize) {
+ if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
}
- --i;
+ // Reduce search size for forward search.
+ sample_size = sample_size - kBackSearchSize + 1;
}
- // ERROR: Invalid jpeg end code not found. Size sample_size
- return LIBYUV_FALSE;
+  return ScanEOI(sample, sample_size);
}
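
The rewritten ValidateJpeg checks the start code, then scans only the final kilobyte for the ff d9 EOI marker, falling back to a forward scan of the remainder only when that fails, so intact frames validate after touching about 1 KB instead of walking the buffer backwards byte by byte. A minimal sketch of the markers involved (illustrative; the real validator also tolerates trailing padding after EOI):

    #include <stddef.h>
    #include <stdint.h>

    // SOI (ff d8) must open the buffer; EOI (ff d9) must appear near its end.
    static int HasJpegMarkers(const uint8_t* buf, size_t n) {
      if (n < 64) return 0;                            // same minimum size
      if (buf[0] != 0xff || buf[1] != 0xd8) return 0;  // no SOI
      for (size_t i = n - 1; i >= 1; --i) {            // backward EOI scan
        if (buf[i - 1] == 0xff && buf[i] == 0xd9) return 1;
      }
      return 0;
    }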
#ifdef __cplusplus
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/planar_functions.cc
index 3857008cae3..75ef775dde8 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/planar_functions.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/planar_functions.cc
@@ -41,16 +41,14 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
if (src_y == dst_y && src_stride_y == dst_stride_y) {
return;
}
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- CopyRow = CopyRow_SSE2;
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@@ -59,8 +57,8 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
@@ -90,15 +88,8 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
-#if defined(HAS_COPYROW_16_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_16_X86;
- }
-#endif
#if defined(HAS_COPYROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_16_SSE2;
}
#endif
@@ -239,25 +230,43 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
src_stride_y = -src_stride_y;
}
#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
}
#endif
#if defined(HAS_MIRRORROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MirrorRow = MirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- MirrorRow = MirrorRow_SSSE3;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Make the MIPS mirror path handle unaligned memory.
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
+ MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
@@ -298,23 +307,17 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -324,7 +327,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width >= 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
@@ -376,23 +379,17 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
}
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -402,7 +399,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
#if defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width >= 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
@@ -497,22 +494,28 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
}
#endif
-#if defined(HAS_ARGBMIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
- ARGBMirrorRow = ARGBMirrorRow_AVX2;
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
}
#endif
-#if defined(HAS_ARGBMIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
- ARGBMirrorRow = ARGBMirrorRow_NEON;
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
}
#endif
@@ -614,7 +617,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
#if defined(HAS_ARGBMULTIPLYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
@@ -622,7 +625,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
@@ -630,7 +633,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBMULTIPLYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_NEON;
@@ -680,7 +683,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBAddRow = ARGBAddRow_SSE2;
@@ -688,7 +691,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBAddRow = ARGBAddRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_AVX2;
@@ -696,7 +699,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBAddRow = ARGBAddRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_NEON;
@@ -741,7 +744,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSUBTRACTROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBSubtractRow = ARGBSubtractRow_SSE2;
@@ -749,7 +752,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_AVX2;
@@ -757,7 +760,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBSUBTRACTROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_NEON;
@@ -808,24 +811,31 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
}
-#if defined(HAS_I422TOBGRAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToBGRARow = I422ToBGRARow_Any_NEON;
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToBGRARow = I422ToBGRARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- I422ToBGRARow = I422ToBGRARow_NEON;
+ I422ToBGRARow = I422ToBGRARow_AVX2;
}
}
-#elif defined(HAS_I422TOBGRAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
- I422ToBGRARow = I422ToBGRARow_SSSE3;
- }
+ I422ToBGRARow = I422ToBGRARow_NEON;
}
}
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
@@ -879,20 +889,26 @@ int I422ToABGR(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
}
#if defined(HAS_I422TOABGRROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToABGRRow = I422ToABGRRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_NEON;
}
}
-#elif defined(HAS_I422TOABGRROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+#endif
+#if defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
- I422ToABGRRow = I422ToABGRRow_SSSE3;
- }
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_AVX2;
}
}
#endif
@@ -941,20 +957,26 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
}
#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_NEON;
}
}
-#elif defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
}
}
#endif
@@ -991,14 +1013,23 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
dst_stride_rgb565 = -dst_stride_rgb565;
}
#if defined(HAS_NV12TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
}
}
-#elif defined(HAS_NV12TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToRGB565Row = NV12ToRGB565Row_NEON;
@@ -1039,14 +1070,23 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
dst_stride_rgb565 = -dst_stride_rgb565;
}
#if defined(HAS_NV21TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
}
}
-#elif defined(HAS_NV21TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV21ToRGB565Row = NV21ToRGB565Row_NEON;
@@ -1070,8 +1110,12 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
uint32 value) {
int y;
- uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
- void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
+ void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
// Coalesce rows.
if (dst_stride_y == width) {
width *= height;
@@ -1079,21 +1123,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
dst_stride_y = 0;
}
#if defined(HAS_SETROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- SetRow = SetRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SetRow = SetRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SetRow = SetRow_NEON;
+ }
}
#endif
#if defined(HAS_SETROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- SetRow = SetRow_X86;
+ if (TestCpuFlag(kCpuHasX86)) {
+ SetRow = SetRow_Any_X86;
+ if (IS_ALIGNED(width, 4)) {
+ SetRow = SetRow_X86;
+ }
+ }
+#endif
+#if defined(HAS_SETROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ SetRow = SetRow_ERMS;
}
#endif
// Set plane
for (y = 0; y < height; ++y) {
- SetRow(dst_y, v32, width);
+ SetRow(dst_y, value, width);
dst_y += dst_stride_y;
}
}
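
SetPlane above also shows the "Coalesce rows" trick used throughout these functions: when the stride equals the width, the rows sit back to back in memory, so the 2-D fill collapses into one pass over a single long row. A self-contained sketch (FillPlane is a hypothetical name):

#include <stdint.h>
typedef uint8_t uint8;

static void FillPlane(uint8* dst, int dst_stride,
                      int width, int height, uint8 value) {
  if (dst_stride == width) {  // plane is contiguous: coalesce the rows
    width *= height;
    height = 1;
    dst_stride = 0;           // the single coalesced row never advances
  }
  for (int y = 0; y < height; ++y) {
    for (int i = 0; i < width; ++i) dst[i] = value;  // SetRow_C equivalent
    dst += dst_stride;
  }
}
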
@@ -1112,7 +1165,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
if (!dst_y || !dst_u || !dst_v ||
- width <= 0 || height <= 0 ||
+ width <= 0 || height == 0 ||
x < 0 || y < 0 ||
value_y < 0 || value_y > 255 ||
value_u < 0 || value_u > 255 ||
@@ -1132,11 +1185,18 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y,
int width, int height,
uint32 value) {
+ int y;
+ void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
if (!dst_argb ||
- width <= 0 || height <= 0 ||
+ width <= 0 || height == 0 ||
dst_x < 0 || dst_y < 0) {
return -1;
}
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
dst_argb += dst_y * dst_stride_argb + dst_x * 4;
// Coalesce rows.
if (dst_stride_argb == width * 4) {
@@ -1144,20 +1204,26 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
height = 1;
dst_stride_argb = 0;
}
-#if defined(HAS_SETROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
- return 0;
+
+#if defined(HAS_ARGBSETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBSetRow = ARGBSetRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_NEON;
+ }
}
#endif
-#if defined(HAS_SETROW_X86)
+#if defined(HAS_ARGBSETROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
- ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height);
- return 0;
+ ARGBSetRow = ARGBSetRow_X86;
}
#endif
- ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
+
+ // Set plane
+ for (y = 0; y < height; ++y) {
+ ARGBSetRow(dst_argb, value, width);
+ dst_argb += dst_stride_argb;
+ }
return 0;
}
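
The height < 0 handling added to SetPlane and ARGBRect follows libyuv's convention that a negative height denotes a bottom-up image: normalize once by pointing at the last row and negating the stride, then iterate top-down as usual. A sketch, with a hypothetical helper name:

#include <stdint.h>
typedef uint8_t uint8;

static void NormalizeOrientation(uint8** dst, int* dst_stride, int* height) {
  if (*height < 0) {
    *height = -*height;
    *dst += (*height - 1) * *dst_stride;  // start at the last row
    *dst_stride = -*dst_stride;           // and step upward through memory
  }
}
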
@@ -1197,9 +1263,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBATTENUATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
@@ -1207,7 +1271,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
@@ -1215,7 +1279,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
@@ -1223,7 +1287,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_NEON;
@@ -1263,7 +1327,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
@@ -1271,7 +1335,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
@@ -1312,12 +1376,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
-#elif defined(HAS_ARGBGRAYROW_NEON)
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_NEON;
}
@@ -1350,11 +1413,11 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
-#elif defined(HAS_ARGBGRAYROW_NEON)
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_NEON;
}
@@ -1383,11 +1446,11 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBSEPIAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_SSSE3;
}
-#elif defined(HAS_ARGBSEPIAROW_NEON)
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_NEON;
}
@@ -1425,11 +1488,11 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
}
-#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
}
@@ -1568,11 +1631,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBQUANTIZEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
}
-#elif defined(HAS_ARGBQUANTIZEROW_NEON)
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBQuantizeRow = ARGBQuantizeRow_NEON;
}
@@ -1743,12 +1806,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSHADEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBShadeRow = ARGBShadeRow_SSE2;
}
-#elif defined(HAS_ARGBSHADEROW_NEON)
+#endif
+#if defined(HAS_ARGBSHADEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBShadeRow = ARGBShadeRow_NEON;
}
@@ -1790,33 +1852,23 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
- IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
- IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -1824,19 +1876,19 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- ScaleARGBFilterRows = InterpolateRow_MIPS_DSPR2;
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
@@ -1876,7 +1928,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
src_stride_bgra = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSHUFFLEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBShuffleRow = ARGBShuffleRow_SSE2;
@@ -1884,19 +1936,15 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBShuffleRow = ARGBShuffleRow_SSSE3;
- }
+ ARGBShuffleRow = ARGBShuffleRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGBShuffleRow = ARGBShuffleRow_AVX2;
@@ -1904,7 +1952,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBShuffleRow = ARGBShuffleRow_NEON;
@@ -1947,8 +1995,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
 // ARGBToBayer is used to select the G channel from ARGB.
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
@@ -1956,8 +2003,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
@@ -1965,7 +2011,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOBAYERGGROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_NEON;
@@ -2048,8 +2094,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelRow = SobelRow_SSE2;
}
#endif
@@ -2070,8 +2115,7 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2;
}
#endif
@@ -2093,8 +2137,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelXYRow = SobelXYRow_SSE2;
}
#endif
@@ -2218,10 +2261,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
- IS_ALIGNED(width, 8)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
}
#endif
@@ -2264,10 +2304,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
- IS_ALIGNED(width, 8)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
}
#endif
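
A second recurring change in this file is turning #elif chains over the SSE2/SSSE3/AVX2/NEON/MIPS candidates into independent #if blocks. With #elif only the first candidate compiled in could ever be selected; with separate blocks every supported kernel is built, and because a later assignment overwrites an earlier one, the textual order of the blocks encodes runtime priority. Schematically (all names illustrative):

#define HAS_ROW_SSE2
#define HAS_ROW_AVX2

typedef void (*RowFn)(int width);
static void Row_C(int) {}
static void Row_SSE2(int) {}
static void Row_AVX2(int) {}
static bool HasSSE2() { return true; }
static bool HasAVX2() { return true; }

static RowFn ChooseRow() {
  RowFn row = Row_C;
#if defined(HAS_ROW_SSE2)
  if (HasSSE2()) row = Row_SSE2;
#endif
#if defined(HAS_ROW_AVX2)
  if (HasAVX2()) row = Row_AVX2;  // wins over SSE2 when both checks pass
#endif
  return row;
}
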
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate.cc
index 2ef3228cb80..5acaccfd89d 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate.cc
@@ -42,11 +42,7 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_MIRRORROW_NEON
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-#define HAS_MIRRORROW_UV_NEON
-void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
@@ -55,7 +51,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
-#endif // defined(__ARM_NEON__)
+#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
@@ -194,31 +190,31 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + edi]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
- movdqa xmm2, [eax]
- movdqa xmm3, [eax + edi]
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
- movdqa xmm4, [eax]
- movdqa xmm5, [eax + edi]
+ movdqu xmm4, [eax]
+ movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
- movdqa xmm6, [eax]
- movdqa xmm7, [eax + edi]
+ movdqu xmm6, [eax]
+ movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
- movdqa [esp], xmm5 // backup xmm5
+ movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
@@ -239,8 +235,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
- movdqa xmm5, [esp] // restore xmm5
- movdqa [esp], xmm6 // backup xmm6
+ movdqu xmm5, [esp] // restore xmm5
+ movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
@@ -251,7 +247,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
- movdqa xmm6, [esp] // restore xmm6
+ movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
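
The assembly hunks above are the other half of dropping the alignment requirements: every movdqa (aligned 128-bit move, which faults on a non-16-byte-aligned address) that touches memory becomes movdqu (unaligned); only the register-to-register movdqa forms survive, since registers have no alignment. The intrinsics equivalent of the two instructions, for reference:

#include <emmintrin.h>  // SSE2 intrinsics

// movdqa equivalent: faults if p is not 16-byte aligned.
static __m128i LoadAligned(const void* p) {
  return _mm_load_si128(static_cast<const __m128i*>(p));
}

// movdqu equivalent: accepts any address; on most modern x86 an aligned
// address costs about the same through this path, which is why the
// alignment checks can be removed outright.
static __m128i LoadUnaligned(const void* p) {
  return _mm_loadu_si128(static_cast<const __m128i*>(p));
}
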
@@ -296,7 +292,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
ret
}
}
-#elif !defined(LIBYUV_DISABLE_X86) && \
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
@@ -379,10 +376,8 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc"
- #if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- #endif
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -411,31 +406,31 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
- "movdqa (%eax),%xmm0 \n"
- "movdqa (%eax,%edi,1),%xmm1 \n"
+ "movdqu (%eax),%xmm0 \n"
+ "movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
- "movdqa (%eax),%xmm2 \n"
- "movdqa (%eax,%edi,1),%xmm3 \n"
+ "movdqu (%eax),%xmm2 \n"
+ "movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
- "movdqa (%eax),%xmm4 \n"
- "movdqa (%eax,%edi,1),%xmm5 \n"
+ "movdqu (%eax),%xmm4 \n"
+ "movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
- "movdqa (%eax),%xmm6 \n"
- "movdqa (%eax,%edi,1),%xmm7 \n"
+ "movdqu (%eax),%xmm6 \n"
+ "movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm5,(%esp) \n"
+ "movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
@@ -455,8 +450,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
- "movdqa (%esp),%xmm5 \n"
- "movdqa %xmm6,(%esp) \n"
+ "movdqu (%esp),%xmm5 \n"
+ "movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
@@ -465,7 +460,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
- "movdqa (%esp),%xmm6 \n"
+ "movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
@@ -514,7 +509,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"ret \n"
#endif
);
-#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
@@ -525,38 +521,38 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
// First round of bit swap.
".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%3),%%xmm1 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa (%0),%%xmm2 \n"
+ "movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqa (%0,%3),%%xmm3 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
- "movdqa (%0),%%xmm4 \n"
+ "movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqa (%0,%3),%%xmm5 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
- "movdqa (%0),%%xmm6 \n"
+ "movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqa (%0,%3),%%xmm7 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
@@ -666,29 +662,29 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
// First round of bit swap.
".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%4),%%xmm1 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
- "movdqa (%0,%4),%%xmm3 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa (%0,%4),%%xmm5 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa (%0,%4),%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
@@ -818,9 +814,7 @@ void TransposePlane(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
TransposeWx8 = TransposeWx8_FAST_SSSE3;
}
#endif
@@ -883,29 +877,38 @@ void RotatePlane180(const uint8* src, int src_stride,
void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
}
#endif
#if defined(HAS_MIRRORROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- MirrorRow = MirrorRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MirrorRow = MirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- MirrorRow = MirrorRow_SSSE3;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
}
#endif
+// TODO(fbarchard): Make MirrorRow on MIPS handle unaligned memory.
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
@@ -913,21 +916,14 @@ void RotatePlane180(const uint8* src, int src_stride,
MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@@ -935,6 +931,11 @@ void RotatePlane180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
@@ -1010,13 +1011,13 @@ void TransposeUV(const uint8* src, int src_stride,
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
}
-#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
TransposeUVWx8 = TransposeUVWx8_SSE2;
}
-#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
+#endif
+#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
@@ -1084,12 +1085,13 @@ void RotateUV180(const uint8* src, int src_stride,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
MirrorRowUV = MirrorUVRow_NEON;
}
-#elif defined(HAS_MIRRORROW_UV_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_MIRRORROW_UV_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorRowUV = MirrorUVRow_SSSE3;
}
-#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_argb.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_argb.cc
index ab0f9ce0707..b9673db15d3 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_argb.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_argb.cc
@@ -31,7 +31,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx,
@@ -50,13 +50,12 @@ static void ARGBTranspose(const uint8* src, int src_stride,
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest.
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
}
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest.
- IS_ALIGNED(src, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
}
#endif
@@ -102,38 +101,38 @@ void ARGBRotate180(const uint8* src, int src_stride,
void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
ARGBMirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- ARGBMirrorRow = ARGBMirrorRow_SSSE3;
- }
-#endif
-#if defined(HAS_ARGBMIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
- ARGBMirrorRow = ARGBMirrorRow_AVX2;
- }
-#endif
#if defined(HAS_ARGBMIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
- ARGBMirrorRow = ARGBMirrorRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
}
#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
- CopyRow = CopyRow_NEON;
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
}
#endif
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86)) {
- CopyRow = CopyRow_X86;
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
}
#endif
#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@@ -141,6 +140,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon.cc
index d354e11faa6..a23a40fee34 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon.cc
@@ -17,7 +17,8 @@ namespace libyuv {
extern "C" {
#endif
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
@@ -525,7 +526,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
}
-#endif
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
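
rotate_neon.cc now excludes itself under __aarch64__: ARMv8-A AArch64 NEON uses different register names and instruction spellings, so the 64-bit implementations move to the new rotate_neon64.cc below. The resulting guard pattern, paraphrased from the two files:

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
// rotate_neon.cc: 32-bit ARM implementations using q registers.
#endif

#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// rotate_neon64.cc: ARMv8 A64 implementations using v registers.
#endif
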
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon64.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon64.cc
new file mode 100644
index 00000000000..92358af7ff6
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon64.cc
@@ -0,0 +1,543 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC NEON on ARMv8 64-bit (aarch64).
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose =
+ { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ const uint8* src_temp = NULL;
+ int64 width64 = (int64) width; // Work around clang 3.4 warning.
+ asm volatile (
+    // Loops are on blocks of 8. The loop will stop when the
+    // counter gets to or below 0. Starting the counter at w-8
+    // allows for this.
+ "sub %3, %3, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.8b}, [%0] \n"
+
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v17.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v21.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v20.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v23.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v22.8b}, [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 8
+ "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %3, %3, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %3, %3, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %3, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %3, #4 \n"
+ "b.lt 2f \n"
+
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[3], [%0] \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(4)
+ "ld1 {v2.16b}, [%4] \n"
+
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ MEMACCESS(0)
+ "st1 {v3.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[3], [%0] \n"
+
+ "add %0, %2, #4 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[3], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 4
+ "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %3, %3, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %3, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[3], [%0] \n"
+
+ "trn2 v2.8b, v0.8b, v1.8b \n"
+ "trn1 v3.8b, v0.8b, v1.8b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v3.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v2.8b}, [%0] \n"
+
+ "add %1, %1, #2 \n" // src += 2
+ "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %3, %3, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[0], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[1], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[2], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[3], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[4], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[5], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[6], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[7], [%1] \n"
+
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2] \n"
+
+ "4: \n"
+
+ : "+r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst), // %2
+ "+r"(width64) // %3
+ : "r"(&kVTbl4x4Transpose), // %4
+ "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+ );
+}
+
+static uint8 kVTbl4x4TransposeDi[32] =
+ { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
+ 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ const uint8* src_temp = NULL;
+ int64 width64 = (int64) width; // Work around clang 3.4 warning.
+ asm volatile (
+    // Loops are on blocks of 8. The loop will stop when the
+    // counter gets to or below 0. Starting the counter at w-8
+    // allows for this.
+ "sub %4, %4, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.16b}, [%0] \n"
+
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v16.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v17.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v17.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v20.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v22.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v21.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v23.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v20.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v22.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v21.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %4, %4, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %4, %4, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %4, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %4, #4 \n"
+ "b.lt 2f \n"
+
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.8b}, [%0] \n"
+
+ MEMACCESS(8)
+ "ld1 {v30.16b}, [%8], #16 \n"
+ "ld1 {v31.16b}, [%8] \n"
+
+ "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
+ "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
+ "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
+ "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v16.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[3], [%0], %6 \n"
+
+ "add %0, %2, #4 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[3], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v17.s}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[2], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[3], [%0], %7 \n"
+
+ "add %0, %3, #4 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[2], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[3], [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 4 * 2
+ "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %4, %4, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %4, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[3], [%0] \n"
+
+ "trn1 v4.8b, v0.8b, v2.8b \n"
+ "trn2 v5.8b, v0.8b, v2.8b \n"
+ "trn1 v6.8b, v1.8b, v3.8b \n"
+ "trn2 v7.8b, v1.8b, v3.8b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v4.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v6.d}[0], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v5.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v7.d}[0], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 2 * 2
+ "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %4, %4, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[7], [%1] \n"
+
+ MEMACCESS(2)
+ "st1 {v0.d}[0], [%2] \n"
+ MEMACCESS(3)
+ "st1 {v1.d}[0], [%3] \n"
+
+ "4: \n"
+
+ : "+r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_b), // %3
+ "+r"(width64) // %4
+ : "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
+ "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
+ "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc",
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+ "v30", "v31"
+ );
+}
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_any.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_any.cc
index ce8b3dad119..19340b3b74c 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_any.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_any.cc
@@ -17,17 +17,14 @@ namespace libyuv {
extern "C" {
#endif
-// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels.
-// TODO(fbarchard): Consider 'any' functions handling odd alignment.
// YUV to RGB does multiple of 8 with SIMD and remainder with C.
#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, \
- const uint8* u_buf, \
- const uint8* v_buf, \
- uint8* rgb_buf, \
- int width) { \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ uint8* rgb_buf, int width) { \
int n = width & ~MASK; \
- I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \
+ if (n > 0) { \
+ I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \
+ } \
I420TORGB_C(y_buf + n, \
u_buf + (n >> UV_SHIFT), \
v_buf + (n >> UV_SHIFT), \
@@ -35,36 +32,59 @@ extern "C" {
}
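
// Editor's illustration, not part of the patch: with the SSSE3 arguments
// used just below, YANY expands to a wrapper equivalent to the following
// sketch (the _Sketch name is hypothetical; the row functions are the real
// ones declared in row.h):
void I422ToARGBRow_Any_SSSE3_Sketch(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf, int width) {
  int n = width & ~7;  // largest multiple of 8 <= width
  if (n > 0) {         // SIMD only when there is at least one full group
    I422ToARGBRow_SSSE3(y_buf, u_buf, v_buf, rgb_buf, n);
  }
  // Remainder (<= 7 pixels) in C; UV advances at half rate (UV_SHIFT == 1)
  // and ARGB output advances 4 bytes per pixel (BPP == 4).
  I422ToARGBRow_C(y_buf + n, u_buf + (n >> 1), v_buf + (n >> 1),
                  rgb_buf + n * 4, width & 7);
}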
#ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
+YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C,
1, 4, 7)
-#endif // HAS_I422TOARGBROW_SSSE3
+#endif
#ifdef HAS_I444TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C,
0, 4, 7)
-YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
+YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, I411ToARGBRow_C,
2, 4, 7)
-YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
+YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, I422ToBGRARow_C,
1, 4, 7)
-YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
+YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, I422ToABGRRow_C,
1, 4, 7)
-YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
+YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, I422ToRGBARow_C,
1, 4, 7)
-// I422ToRGB565Row_SSSE3 is unaligned.
YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
1, 2, 7)
YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
1, 2, 7)
-// I422ToRGB24Row_SSSE3 is unaligned.
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_J422TOARGBROW_SSSE3
+YANY(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, J422ToARGBRow_C,
+ 1, 4, 7)
+#endif
#ifdef HAS_I422TOARGBROW_AVX2
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
-#endif // HAS_I422TOARGBROW_AVX2
+#endif
+#ifdef HAS_I422TOBGRAROW_AVX2
+YANY(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, I422ToBGRARow_C, 1, 4, 15)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+YANY(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, I422ToRGBARow_C, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOABGRROW_AVX2
+YANY(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, I422ToABGRRow_C, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+YANY(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, I422ToARGB4444Row_C,
+ 1, 2, 7)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+YANY(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, I422ToARGB1555Row_C,
+ 1, 2, 7)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+YANY(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, I422ToRGB565Row_C,
+ 1, 2, 7)
+#endif
#ifdef HAS_I422TOARGBROW_NEON
YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
@@ -79,214 +99,240 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
-#endif // HAS_I422TOARGBROW_NEON
+#endif
#ifdef HAS_I422TOYUY2ROW_NEON
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
-#endif // HAS_I422TOYUY2ROW_NEON
+#endif
#ifdef HAS_I422TOUYVYROW_NEON
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
-#endif // HAS_I422TOUYVYROW_NEON
+#endif
#undef YANY
// Wrappers to handle odd width
-#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \
- void NAMEANY(const uint8* y_buf, \
- const uint8* uv_buf, \
- uint8* rgb_buf, \
- int width) { \
- int n = width & ~7; \
- NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \
+#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
+ uint8* rgb_buf, int width) { \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \
+ } \
NV12TORGB_C(y_buf + n, \
uv_buf + (n >> UV_SHIFT), \
- rgb_buf + n * BPP, width & 7); \
+ rgb_buf + n * BPP, width & MASK); \
}
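// Editor's note: the n > 0 guard means a width below MASK + 1 skips the
// SIMD call entirely. e.g. width == 29 with MASK == 7 gives
// n = 29 & ~7 = 24 pixels on the SIMD path and width & 7 = 5 pixels in C.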
#ifdef HAS_NV12TOARGBROW_SSSE3
-NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
- 0, 4)
-NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
- 0, 4)
-#endif // HAS_NV12TOARGBROW_SSSE3
+NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C, 0, 4, 7)
+NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C, 0, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+NV2NY(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, NV12ToARGBRow_C, 0, 4, 15)
+NV2NY(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, NV21ToARGBRow_C, 0, 4, 15)
+#endif
#ifdef HAS_NV12TOARGBROW_NEON
-NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
-NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
-#endif // HAS_NV12TOARGBROW_NEON
+NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4, 7)
+NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4, 7)
+#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
- 0, 2)
+ 0, 2, 7)
NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
- 0, 2)
-#endif // HAS_NV12TORGB565ROW_SSSE3
+ 0, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+NV2NY(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, NV12ToRGB565Row_C,
+ 0, 2, 15)
+NV2NY(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, NV21ToRGB565Row_C,
+ 0, 2, 15)
+#endif
#ifdef HAS_NV12TORGB565ROW_NEON
-NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
-NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
-#endif // HAS_NV12TORGB565ROW_NEON
+NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C,
+ 0, 2, 7)
+NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C,
+ 0, 2, 7)
+#endif
#undef NV2NY
-#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
- void NAMEANY(const uint8* src, \
- uint8* dst, \
- int width) { \
+#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src, uint8* dst, int width) { \
int n = width & ~MASK; \
- ARGBTORGB_SIMD(src, dst, n); \
+ if (n > 0) { \
+ ARGBTORGB_SIMD(src, dst, n); \
+ } \
ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \
}
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C,
- 15, 4, 3)
+ 4, 3, 15)
RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C,
- 15, 4, 3)
+ 4, 3, 15)
RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C,
- 3, 4, 2)
+ 4, 2, 3)
RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
- 3, 4, 2)
+ 4, 2, 3)
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
- 3, 4, 2)
+ 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+RGBANY(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, ARGBToRGB565Row_C,
+ 4, 2, 7)
+RGBANY(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, ARGBToARGB1555Row_C,
+ 4, 2, 7)
+RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C,
+ 4, 2, 7)
#endif
+
#if defined(HAS_I400TOARGBROW_SSE2)
-RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
- 7, 1, 4)
+RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7)
#endif
#if defined(HAS_YTOARGBROW_SSE2)
-RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
- 7, 1, 4)
-RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
- 15, 2, 4)
-RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
- 15, 2, 4)
-// These require alignment on ARGB, so C is used for remainder.
+RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 1, 4, 7)
+#endif
+#if defined(HAS_YTOARGBROW_AVX2)
+RGBANY(YToARGBRow_Any_AVX2, YToARGBRow_AVX2, YToARGBRow_C, 1, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C, 2, 4, 15)
+RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, UYVYToARGBRow_C, 2, 4, 15)
RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
- 15, 3, 4)
-RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C,
- 15, 3, 4)
+ 3, 4, 15)
+RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C, 3, 4, 15)
RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C,
- 7, 2, 4)
+ 2, 4, 7)
RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C,
- 7, 2, 4)
+ 2, 4, 7)
RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
- 7, 2, 4)
+ 2, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+RGBANY(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, YUY2ToARGBRow_C, 2, 4, 31)
+RGBANY(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, UYVYToARGBRow_C, 2, 4, 31)
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
-RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
-RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3)
+RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 4, 3, 7)
+RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 4, 3, 7)
RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C,
- 7, 4, 2)
+ 4, 2, 7)
RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
- 7, 4, 2)
+ 4, 2, 7)
RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
- 7, 4, 2)
-RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
- 7, 1, 4)
-RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
- 7, 1, 4)
-RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
- 7, 2, 4)
-RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
- 7, 2, 4)
+ 4, 2, 7)
+RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C, 1, 4, 7)
+RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C, 1, 4, 7)
+RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C, 2, 4, 7)
+RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, 2, 4, 7)
#endif
#undef RGBANY
// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
-#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
- void NAMEANY(const uint8* src, \
- uint8* dst, uint32 selector, \
- int width) { \
+#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) { \
int n = width & ~MASK; \
- ARGBTORGB_SIMD(src, dst, selector, n); \
+ if (n > 0) { \
+ ARGBTORGB_SIMD(src, dst, selector, n); \
+ } \
ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \
}
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
- 7, 4, 1)
-#endif
-#if defined(HAS_ARGBTOBAYERROW_NEON)
-BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
- 7, 4, 1)
-#endif
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
- 7, 4, 1)
+ 4, 1, 7)
#endif
#if defined(HAS_ARGBTOBAYERGGROW_NEON)
BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
- 7, 4, 1)
+ 4, 1, 7)
#endif
#undef BAYERANY
-// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD.
-#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
- ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \
- ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \
- dst_y + (width - NUM) * BPP, NUM); \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ARGBTOY_SIMD(src_argb, dst_y, n); \
+ } \
+ ARGBTOY_C(src_argb + n * SBPP, \
+ dst_y + n * BPP, width & MASK); \
}
-
#ifdef HAS_ARGBTOYROW_AVX2
-YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
-YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32)
-YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
-YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
+YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, ARGBToYRow_C, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, ARGBToYJRow_C, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
+YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, UYVYToYRow_C, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, YUY2ToYRow_C, 2, 1, 31)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
-YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, ARGBToYRow_C, 4, 1, 15)
#endif
#ifdef HAS_BGRATOYROW_SSSE3
-YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
-YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
+YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, BGRAToYRow_C, 4, 1, 15)
+YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, ABGRToYRow_C, 4, 1, 15)
+YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, RGBAToYRow_C, 4, 1, 15)
+YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, YUY2ToYRow_C, 2, 1, 15)
+YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, UYVYToYRow_C, 2, 1, 15)
#endif
#ifdef HAS_ARGBTOYJROW_SSSE3
-YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, ARGBToYJRow_C, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_NEON
-YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
-YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
-YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
-YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
-YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
-YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
-YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
-YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
-YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
-YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
+YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, ARGBToYRow_C, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, ARGBToYJRow_C, 4, 1, 7)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, BGRAToYRow_C, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, ABGRToYRow_C, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, RGBAToYRow_C, 4, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, RGB24ToYRow_C, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, RAWToYRow_C, 3, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, RGB565ToYRow_C, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, ARGB1555ToYRow_C, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, ARGB4444ToYRow_C, 2, 1, 7)
#endif
#ifdef HAS_YUY2TOYROW_NEON
-YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, YUY2ToYRow_C, 2, 1, 15)
#endif
#ifdef HAS_UYVYTOYROW_NEON
-YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, UYVYToYRow_C, 2, 1, 15)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
-YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, RGB24ToARGBRow_C, 3, 4, 7)
#endif
#ifdef HAS_RAWTOARGBROW_NEON
-YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, RAWToARGBRow_C, 3, 4, 7)
#endif
#ifdef HAS_RGB565TOARGBROW_NEON
-YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
+YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, RGB565ToARGBRow_C, 2, 4, 7)
#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
-YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
+YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, ARGB1555ToARGBRow_C,
+ 2, 4, 7)
#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
-YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
+YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, ARGB4444ToARGBRow_C,
+ 2, 4, 7)
#endif
-#undef YANY
-
-#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
- int n = width & ~MASK; \
- ARGBTOY_SIMD(src_argb, dst_y, n); \
- ARGBTOY_C(src_argb + n * SBPP, \
- dst_y + n * BPP, width & MASK); \
- }
-
-// Attenuate is destructive so last16 method can not be used due to overlap.
#ifdef HAS_ARGBATTENUATEROW_SSSE3
YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
4, 4, 3)
@@ -318,7 +364,9 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
void NAMEANY(const uint8* src_argb, int src_stride_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
- ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
+ if (n > 0) { \
+ ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
+ } \
ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
dst_u + (n >> 1), \
dst_v + (n >> 1), \
@@ -327,29 +375,50 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
#ifdef HAS_ARGBTOUVROW_AVX2
UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, ARGBToUVJRow_C, 4, 15)
+UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, RGBAToUVRow_C, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
-UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
- 4, 15)
-UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
-UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
-UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
-UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
-UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
+#ifdef HAS_YUY2TOUVROW_SSE2
+UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, UYVYToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGBTOUVROW_NEON
UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
@@ -360,11 +429,12 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
#endif
#undef UVANY
-#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \
- void NAMEANY(const uint8* src_uv, \
- uint8* dst_u, uint8* dst_v, int width) { \
+#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, SHIFT, MASK) \
+ void NAMEANY(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
- ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ if (n > 0) { \
+ ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ } \
ANYTOUV_C(src_uv + n * BPP, \
dst_u + (n >> SHIFT), \
dst_v + (n >> SHIFT), \
@@ -372,42 +442,45 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
}
#ifdef HAS_ARGBTOUV444ROW_SSSE3
-UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
- ARGBToUV444Row_C, 4, 15, 0)
+UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3,
+ ARGBToUV444Row_C, 4, 0, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_AVX2
UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
- YUY2ToUV422Row_C, 2, 31, 1)
+ YUY2ToUV422Row_C, 2, 1, 31)
UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
- UYVYToUV422Row_C, 2, 31, 1)
+ UYVYToUV422Row_C, 2, 1, 31)
#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
- ARGBToUV422Row_C, 4, 15, 1)
-UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
- YUY2ToUV422Row_C, 2, 15, 1)
-UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
- UYVYToUV422Row_C, 2, 15, 1)
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
+UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3,
+ ARGBToUV422Row_C, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2,
+ YUY2ToUV422Row_C, 2, 1, 15)
+UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2,
+ UYVYToUV422Row_C, 2, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON,
- ARGBToUV444Row_C, 4, 7, 0)
+ ARGBToUV444Row_C, 4, 0, 7)
UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON,
- ARGBToUV422Row_C, 4, 15, 1)
+ ARGBToUV422Row_C, 4, 1, 15)
UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON,
- ARGBToUV411Row_C, 4, 31, 2)
+ ARGBToUV411Row_C, 4, 2, 31)
UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
- YUY2ToUV422Row_C, 2, 15, 1)
+ YUY2ToUV422Row_C, 2, 1, 15)
UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
- UYVYToUV422Row_C, 2, 15, 1)
+ UYVYToUV422Row_C, 2, 1, 15)
#endif
#undef UV422ANY
#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
- void NAMEANY(const uint8* src_uv, \
- uint8* dst_u, uint8* dst_v, int width) { \
+ void NAMEANY(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
- ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ if (n > 0) { \
+ ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ } \
ANYTOUV_C(src_uv + n * 2, \
dst_u + n, \
dst_v + n, \
@@ -415,7 +488,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
}
#ifdef HAS_SPLITUVROW_SSE2
-SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
+SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_AVX2
SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
@@ -424,7 +497,7 @@ SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_MIPS_DSPR2
-SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
+SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2,
SplitUVRow_C, 15)
#endif
#undef SPLITUVROWANY
@@ -433,7 +506,9 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
void NAMEANY(const uint8* src_u, const uint8* src_v, \
uint8* dst_uv, int width) { \
int n = width & ~MASK; \
- ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
+ if (n > 0) { \
+ ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
+ } \
ANYTOUV_C(src_u + n, \
src_v + n, \
dst_uv + n * 2, \
@@ -441,7 +516,7 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
}
#ifdef HAS_MERGEUVROW_SSE2
-MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
+MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, MergeUVRow_C, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX2
MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
@@ -455,7 +530,9 @@ MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \
uint8* dst_argb, int width) { \
int n = width & ~MASK; \
- ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \
+ if (n > 0) { \
+ ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \
+ } \
ARGBMATH_C(src_argb0 + n * 4, \
src_argb1 + n * 4, \
dst_argb + n * 4, \
@@ -502,7 +579,9 @@ MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
void NAMEANY(const uint8* src_argb, uint8* dst_argb, \
const uint8* shuffler, int width) { \
int n = width & ~MASK; \
- ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \
+ if (n > 0) { \
+ ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \
+ } \
ARGBTOY_C(src_argb + n * SBPP, \
dst_argb + n * BPP, shuffler, width & MASK); \
}
@@ -512,7 +591,7 @@ YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
ARGBShuffleRow_C, 4, 4, 3)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
+YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3,
ARGBShuffleRow_C, 4, 4, 7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_AVX2
@@ -531,35 +610,107 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
ptrdiff_t src_stride_ptr, int width, \
int source_y_fraction) { \
int n = width & ~MASK; \
- TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \
- n, source_y_fraction); \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
+ } \
TERP_C(dst_ptr + n * BPP, \
src_ptr + n * SBPP, src_stride_ptr, \
width & MASK, source_y_fraction); \
}
#ifdef HAS_INTERPOLATEROW_AVX2
-NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
- InterpolateRow_C, 1, 1, 32)
+NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, InterpolateRow_C, 1, 1, 31)
#endif
#ifdef HAS_INTERPOLATEROW_SSSE3
-NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
- InterpolateRow_C, 1, 1, 15)
+NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_SSE2
-NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
- InterpolateRow_C, 1, 1, 15)
+NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
-NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
- InterpolateRow_C, 1, 1, 15)
+NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
-NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
- InterpolateRow_C, 1, 1, 3)
+NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C,
+ 1, 1, 3)
#endif
#undef NANY
+#define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \
+ void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
+ int n = width & ~MASK; \
+ int r = width & MASK; \
+ if (n > 0) { \
+ MIRROR_SIMD(src_y, dst_y + r * BPP, n); \
+ } \
+ MIRROR_C(src_y + n * BPP, dst_y, r); \
+ }
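+// Editor's note on the mirror split above: the SIMD kernel writes its n
+// reversed pixels to the *end* of the destination row (dst_y + r * BPP)
+// and the C fallback writes the remaining r to the front, so dst[i] ==
+// src[width - 1 - i] holds across the seam. e.g. width 20, MASK 15:
+// n = 16, r = 4; src[0..15] land reversed in dst[4..19] and src[16..19]
+// land reversed in dst[0..3].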
+
+#ifdef HAS_MIRRORROW_AVX2
+MANY(MirrorRow_Any_AVX2, MirrorRow_AVX2, MirrorRow_C, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+MANY(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, MirrorRow_C, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_SSE2
+MANY(MirrorRow_Any_SSE2, MirrorRow_SSE2, MirrorRow_C, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+MANY(MirrorRow_Any_NEON, MirrorRow_NEON, MirrorRow_C, 1, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+MANY(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, ARGBMirrorRow_C, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+MANY(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, ARGBMirrorRow_C, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3)
+#endif
+#undef MANY
+
+#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \
+ void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
+ int n = width & ~MASK; \
+ int r = width & MASK; \
+ if (n > 0) { \
+ COPY_SIMD(src_y, dst_y, n); \
+ } \
+ COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \
+ }
+
+#ifdef HAS_COPYROW_AVX
+MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
+#endif
+#undef MANY
+
+#define SETANY(NAMEANY, SET_SIMD, SET_C, T, BPP, MASK) \
+ void NAMEANY(uint8* dst_y, T v8, int width) { \
+ int n = width & ~MASK; \
+ int r = width & MASK; \
+ if (n > 0) { \
+ SET_SIMD(dst_y, v8, n); \
+ } \
+ SET_C(dst_y + n * BPP, v8, r); \
+ }
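+// Editor's note: SETANY is the same split/remainder pattern specialised for
+// fill routines; T is the element type (uint8 for SetRow, uint32 for
+// ARGBSetRow). e.g. ARGBSetRow_Any_NEON with width 10 and MASK 3 fills
+// 8 pixels with NEON and the last 2 with ARGBSetRow_C.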
+
+#ifdef HAS_SETROW_X86
+SETANY(SetRow_Any_X86, SetRow_X86, SetRow_ERMS, uint8, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+SETANY(SetRow_Any_NEON, SetRow_NEON, SetRow_C, uint8, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+SETANY(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, ARGBSetRow_C, uint32, 4, 3)
+#endif
+#undef SETANY
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_common.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_common.cc
index fa2b752a2ae..e0e2bf4261d 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_common.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_common.cc
@@ -199,6 +199,32 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
}
}
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+ const uint8* dither8x8, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ int dither0 = dither8x8[x & 7] - 128;
+ int dither1 = dither8x8[(x & 7) + 1] - 128;
+ uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
+ uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
+ uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
+ uint8 b1 = Clamp(src_argb[4] + dither1) >> 3;
+ uint8 g1 = Clamp(src_argb[5] + dither1) >> 2;
+ uint8 r1 = Clamp(src_argb[6] + dither1) >> 3;
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27));
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ int dither0 = dither8x8[(width - 1) & 7] - 128;
+ uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
+ uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
+ uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
+ *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
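+// Editor's note on the packing above: after dithering, each pixel keeps
+// 5/6/5 bits of B/G/R and two pixels are packed into one little-endian
+// 32-bit word: bits 0-4 b0, 5-10 g0, 11-15 r0, 16-20 b1, 21-26 g1,
+// 27-31 r1. e.g. Clamp(0xFF) >> 3 == 0x1F fills a 5-bit field exactly.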
+
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -385,6 +411,28 @@ void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
MAKEROWYJ(ARGB, 2, 1, 0, 4)
#undef MAKEROWYJ
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+ uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+ uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+ dst_u[0] = RGBToUJ(ar, ag, ab);
+ dst_v[0] = RGBToVJ(ar, ag, ab);
+ src_argb += 8;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToUJ(ar, ag, ab);
+ dst_v[0] = RGBToVJ(ar, ag, ab);
+ }
+}
+
void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -938,33 +986,52 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
}
}
-// C reference code that mimics the YUV assembly.
-
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+// C reference code that mimics the YUV assembly.
static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
- int32 y1 = ((int32)(y) - 16) * YG;
- *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6);
- *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6);
- *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6);
+ uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32)(BB - ( u * UB) + y1) >> 6);
+ *g = Clamp((int32)(BG - (v * VG + u * UG) + y1) >> 6);
+ *r = Clamp((int32)(BR - (v * VR ) + y1) >> 6);
}
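+// Editor's worked example for the fixed-point math above: video-range
+// black, YuvPixel(16, 128, 128), gives y1 = (16 * 0x0101 * YG) >> 16
+// = 1191 and B = Clamp((BB - 128 * UB + y1) >> 6)
+// = Clamp((-17544 + 16384 + 1191) >> 6) = Clamp(31 >> 6) = 0; G and R
+// reduce to the same 31 >> 6 = 0. White, YuvPixel(235, 128, 128), gives
+// 16346 >> 6 = 255 on all three channels.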
+// C reference code that mimics the YUV assembly.
+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32)(y1 - YGB) >> 6);
+ *g = Clamp((int32)(y1 - YGB) >> 6);
+ *r = Clamp((int32)(y1 - YGB) >> 6);
+}
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C code that mimics the assembly.
// TODO(fbarchard): Remove subsampling from Neon.
void I444ToARGBRow_C(const uint8* src_y,
@@ -1008,6 +1075,7 @@ void I444ToARGBRow_C(const uint8* src_y,
}
}
#endif
+
// Also used for 420
void I422ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
@@ -1034,6 +1102,59 @@ void I422ToARGBRow_C(const uint8* src_y,
}
}
+// C reference code that mimics the YUV assembly.
+// * R = Y + 1.40200 * Cr
+// * G = Y - 0.34414 * Cb - 0.71414 * Cr
+// * B = Y + 1.77200 * Cb
+
+#define YGJ 64 /* (int8)round(1.000 * 64) */
+
+#define UBJ 113 /* (int8)round(1.772 * 64) */
+#define UGJ -22 /* (int8)round(-0.34414 * 64) */
+#define URJ 0
+
+#define VBJ 0
+#define VGJ -46 /* (int8)round(-0.71414 * 64) */
+#define VRJ 90 /* (int8)round(1.402 * 64) */
+
+// Bias
+#define BBJ (UBJ * 128 + VBJ * 128)
+#define BGJ (UGJ * 128 + VGJ * 128)
+#define BRJ (URJ * 128 + VRJ * 128)
+
+static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * YGJ);
+ *b = Clamp((int32)(u * UBJ + v * VBJ + y1 - BBJ) >> 6);
+ *g = Clamp((int32)(u * UGJ + v * VGJ + y1 - BGJ) >> 6);
+ *r = Clamp((int32)(u * URJ + v * VRJ + y1 - BRJ) >> 6);
+}
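+// Editor's worked example: JPEG YUV is full range, so no 16..235 scaling
+// is applied. YuvJPixel(255, 128, 128) gives, on B,
+// Clamp((128 * UBJ + 255 * YGJ - BBJ) >> 6) = Clamp(16320 >> 6) = 255,
+// and likewise 255 on G and R; YuvJPixel(0, 128, 128) gives black.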
+
+void J422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvJPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvJPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvJPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
void I422ToRGB24Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1470,18 +1591,15 @@ void I422ToRGBARow_C(const uint8* src_y,
void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], 128, 128,
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
- YuvPixel(src_y[1], 128, 128,
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], 128, 128,
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
}
@@ -1569,28 +1687,15 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
memcpy(dst, src, count * 2);
}
-void SetRow_C(uint8* dst, uint32 v8, int count) {
-#ifdef _MSC_VER
- // VC will generate rep stosb.
- int x;
- for (x = 0; x < count; ++x) {
- dst[x] = v8;
- }
-#else
- memset(dst, v8, count);
-#endif
+void SetRow_C(uint8* dst, uint8 v8, int width) {
+ memset(dst, v8, width);
}
-void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- int y;
- for (y = 0; y < height; ++y) {
- uint32* d = (uint32*)(dst);
- int x;
- for (x = 0; x < width; ++x) {
- d[x] = v32;
- }
- dst += dst_stride;
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
+ uint32* d = (uint32*)(dst_argb);
+ int x;
+ for (x = 0; x < width; ++x) {
+ d[x] = v32;
}
}
@@ -1885,17 +1990,17 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
}
}
-// Blend 2 rows into 1 for conversions such as I422ToI420.
-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
+// Blend 2 rows into 1.
+static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
int x;
for (x = 0; x < pix; ++x) {
dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
}
}
-void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
- uint16* dst_uv, int pix) {
+static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+ uint16* dst_uv, int pix) {
int x;
for (x = 0; x < pix; ++x) {
dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
@@ -1957,24 +2062,6 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
}
-// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
-void ARGBToBayerRow_C(const uint8* src_argb,
- uint8* dst_bayer, uint32 selector, int pix) {
- int index0 = selector & 0xff;
- int index1 = (selector >> 8) & 0xff;
- // Copy a row of Bayer.
- int x;
- for (x = 0; x < pix - 1; x += 2) {
- dst_bayer[0] = src_argb[index0];
- dst_bayer[1] = src_argb[index1];
- src_argb += 8;
- dst_bayer += 2;
- }
- if (pix & 1) {
- dst_bayer[0] = src_argb[index0];
- }
-}
-
// Select G channel from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_C(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
@@ -2061,122 +2148,272 @@ void I422ToUYVYRow_C(const uint8* src_y,
}
}
-#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
+// Maximum temporary width for wrappers to process at a time, in pixels.
+#define MAXTWIDTH 2048
+
+#if !defined(_MSC_VER) && defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has an asm version, but GCC uses a 2-step wrapper.
-#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* rgb_buf,
+ uint8* dst_rgb565,
int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
- ARGBToRGB565Row_SSE2(row, rgb_buf, width);
- free_aligned_buffer_64(row);
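+ // Row buffer for intermediate ARGB pixels.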
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
}
-#endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
+#endif
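+// Editor's note: these two-step wrappers now convert through a fixed
+// stack buffer of at most MAXTWIDTH (2048) ARGB pixels instead of a heap
+// row sized to the full width; e.g. width 5000 is processed as chunks of
+// 2048, 2048 and 904 pixels, with Y advancing by twidth and the half-rate
+// U and V planes by twidth / 2 per iteration.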
-#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
void I422ToARGB1555Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* rgb_buf,
+ uint8* dst_argb1555,
int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
- ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
- free_aligned_buffer_64(row);
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb1555 += twidth * 2;
+ width -= twidth;
+ }
}
+#endif
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
void I422ToARGB4444Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* rgb_buf,
+ uint8* dst_argb4444,
int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
- ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
- free_aligned_buffer_64(row);
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb4444 += twidth * 2;
+ width -= twidth;
+ }
}
+#endif
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
- ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
- free_aligned_buffer_64(row);
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
}
+#endif
-void NV21ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_rgb565,
- int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
- ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
- free_aligned_buffer_64(row);
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_vu += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
}
+#endif
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
- YUY2ToYRow_SSE2(src_yuy2, row_y, width);
- I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
- YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
- I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
- UYVYToYRow_SSE2(src_uyvy, row_y, width);
- I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
- UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
- I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth);
+ YUY2ToYRow_SSE2(src_yuy2, row_y, twidth);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+ src_yuy2 += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth);
+ UYVYToYRow_SSE2(src_uyvy, row_y, twidth);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+ src_uyvy += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif // HAS_UYVYTOARGBROW_SSSE3
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
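+ // Row buffer for intermediate ARGB pixels.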
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb1555 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb4444 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth);
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth);
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_vu += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth);
+ YUY2ToYRow_AVX2(src_yuy2, row_y, twidth);
+ I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
+ src_yuy2 += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth);
+ UYVYToYRow_AVX2(src_uyvy, row_y, twidth);
+ I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
+ src_uyvy += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
#endif // HAS_UYVYTOARGBROW_AVX2
void ARGBPolynomialRow_C(const uint8* src_argb,
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_mips.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_mips.cc
index ae9370c1b02..cfc9ffe0368 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_mips.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_mips.cc
@@ -378,7 +378,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// MIPS DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
@@ -447,89 +447,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
);
}
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
- uint8* dst_v, int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "blez $t4, 2f \n"
- " andi %[width], %[width], 0xf \n" // residual
-
- ".p2align 2 \n"
- "1: \n"
- "addiu $t4, $t4, -1 \n"
- "lwr $t0, 0(%[src_uv]) \n"
- "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0
- "lwr $t1, 4(%[src_uv]) \n"
- "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2
- "lwr $t2, 8(%[src_uv]) \n"
- "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4
- "lwr $t3, 12(%[src_uv]) \n"
- "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6
- "lwr $t5, 16(%[src_uv]) \n"
- "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8
- "lwr $t6, 20(%[src_uv]) \n"
- "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10
- "lwr $t7, 24(%[src_uv]) \n"
- "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12
- "lwr $t8, 28(%[src_uv]) \n"
- "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14
- "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
- "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
- "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
- "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
- "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
- "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
- "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
- "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
- "addiu %[src_uv], %[src_uv], 32 \n"
- "swr $t9, 0(%[dst_v]) \n"
- "swl $t9, 3(%[dst_v]) \n"
- "swr $t0, 0(%[dst_u]) \n"
- "swl $t0, 3(%[dst_u]) \n"
- "swr $t1, 4(%[dst_v]) \n"
- "swl $t1, 7(%[dst_v]) \n"
- "swr $t2, 4(%[dst_u]) \n"
- "swl $t2, 7(%[dst_u]) \n"
- "swr $t3, 8(%[dst_v]) \n"
- "swl $t3, 11(%[dst_v]) \n"
- "swr $t5, 8(%[dst_u]) \n"
- "swl $t5, 11(%[dst_u]) \n"
- "swr $t6, 12(%[dst_v]) \n"
- "swl $t6, 15(%[dst_v]) \n"
- "swr $t7, 12(%[dst_u]) \n"
- "swl $t7, 15(%[dst_u]) \n"
- "addiu %[dst_u], %[dst_u], 16 \n"
- "bgtz $t4, 1b \n"
- " addiu %[dst_v], %[dst_v], 16 \n"
-
- "beqz %[width], 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, 0(%[src_uv]) \n"
- "lbu $t1, 1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], 2 \n"
- "addiu %[width], %[width], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[width], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r" (src_uv),
- [width] "+r" (width),
- [dst_u] "+r" (dst_u),
- [dst_v] "+r" (dst_v)
- :
- : "t0", "t1", "t2", "t3",
- "t4", "t5", "t6", "t7", "t8", "t9"
- );
-}
-
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
@@ -927,9 +844,9 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
}
// Bilinear filter 8x2 -> 8x1
-void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon.cc
index 1392cf5fcc3..8badc5a9b94 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon.cc
@@ -16,7 +16,8 @@ extern "C" {
#endif
// This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
@@ -92,36 +93,73 @@ extern "C" {
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
+#define YUV422TORGB_SETUP_REG \
+ "vld1.8 {d24}, [%[kUVToRB]] \n" \
+ "vld1.8 {d25}, [%[kUVToG]] \n" \
+ "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
+ "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
+ "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
+ "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
+
#define YUV422TORGB \
- "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
- "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
- "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
- "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
- "vtrn.u8 d0, d1 \n" \
- "vsub.s16 q0, q0, q15 \n"/* offset y */\
- "vmul.s16 q0, q0, q14 \n" \
+ "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\
+ "vmull.u8 q9, d2, d25 \n" /* u/v G component */\
+ "vmovl.u8 q0, d0 \n" /* Y */\
+ "vmovl.s16 q10, d1 \n" \
+ "vmovl.s16 q0, d0 \n" \
+ "vmul.s32 q10, q10, q15 \n" \
+ "vmul.s32 q0, q0, q15 \n" \
+ "vqshrun.s32 d0, q0, #16 \n" \
+ "vqshrun.s32 d1, q10, #16 \n" /* Y */\
"vadd.s16 d18, d19 \n" \
- "vqadd.s16 d20, d0, d16 \n" /* B */ \
- "vqadd.s16 d21, d1, d16 \n" \
- "vqadd.s16 d22, d0, d17 \n" /* R */ \
- "vqadd.s16 d23, d1, d17 \n" \
- "vqadd.s16 d16, d0, d18 \n" /* G */ \
- "vqadd.s16 d17, d1, d18 \n" \
- "vqshrun.s16 d0, q10, #6 \n" /* B */ \
- "vqshrun.s16 d1, q11, #6 \n" /* G */ \
- "vqshrun.s16 d2, q8, #6 \n" /* R */ \
- "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
- "vmovl.u8 q11, d1 \n" \
- "vmovl.u8 q8, d2 \n" \
- "vtrn.u8 d20, d21 \n" \
- "vtrn.u8 d22, d23 \n" \
- "vtrn.u8 d16, d17 \n" \
- "vmov.u8 d21, d16 \n"
-
-static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
- 0, 0, 0, 0, 0, 0, 0, 0 };
+ "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\
+ "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\
+ "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\
+ "vaddw.u16 q1, q1, d16 \n" \
+ "vaddw.u16 q10, q10, d17 \n" \
+ "vaddw.u16 q3, q3, d18 \n" \
+ "vqadd.s16 q8, q0, q13 \n" /* B */ \
+ "vqadd.s16 q9, q0, q14 \n" /* R */ \
+ "vqadd.s16 q0, q0, q4 \n" /* G */ \
+ "vqadd.s16 q8, q8, q1 \n" /* B */ \
+ "vqadd.s16 q9, q9, q10 \n" /* R */ \
+ "vqsub.s16 q0, q0, q3 \n" /* G */ \
+ "vqshrun.s16 d20, q8, #6 \n" /* B */ \
+ "vqshrun.s16 d22, q9, #6 \n" /* R */ \
+ "vqshrun.s16 d21, q0, #6 \n" /* G */
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+static uvec8 kUVToRB = { 128, 128, 128, 128, 102, 102, 102, 102,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
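These constants implement a 6-bit fixed-point BT.601 conversion. A scalar sketch of the per-pixel math the NEON macro vectorizes, with the constants folded back in (Clamp8 and YuvPixel_Sketch are illustrative names; treat this as an annotation, not libyuv's reference code):

static uint8 Clamp8(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixel_Sketch(uint8 y, uint8 u, uint8 v,
                            uint8* b, uint8* g, uint8* r) {
  // Replicate Y to 16 bits, then scale: y1 is roughly 1.164 * 64 * Y.
  int y1 = (int)(((uint32)y * 0x0101 * 18997u) >> 16);            /* YG */
  *b = Clamp8((y1 + 128 * u - (128 * 128 + 1160)) >> 6);          /* UB, BB */
  *g = Clamp8((y1 - (25 * u + 52 * v) + (77 * 128 - 1160)) >> 6); /* UG, VG, BG */
  *r = Clamp8((y1 + 102 * v - (102 * 128 + 1160)) >> 6);          /* VR, BR */
}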
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
@@ -129,13 +167,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV444
@@ -150,8 +182,10 @@ void I444ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -163,13 +197,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -184,8 +212,10 @@ void I422ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -197,13 +227,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV411
@@ -218,8 +242,10 @@ void I411ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -231,13 +257,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
uint8* dst_bgra,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -253,8 +273,10 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_bgra), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -266,13 +288,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
uint8* dst_abgr,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -288,8 +304,10 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_abgr), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -301,13 +319,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
uint8* dst_rgba,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -322,8 +334,10 @@ void I422ToRGBARow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgba), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -335,13 +349,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
uint8* dst_rgb24,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -355,8 +363,10 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgb24), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -368,13 +378,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
uint8* dst_raw,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -389,8 +393,10 @@ void I422ToRAWRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_raw), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -414,13 +420,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -435,8 +435,10 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgb565), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -463,13 +465,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -485,8 +481,10 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb1555), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -507,13 +505,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n"
"1: \n"
@@ -530,8 +522,10 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb4444), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -541,13 +535,7 @@ void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV400
@@ -560,8 +548,10 @@ void YToARGBRow_NEON(const uint8* src_y,
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -595,13 +585,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV12
@@ -615,8 +599,10 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -627,13 +613,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV21
@@ -647,8 +627,10 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -659,13 +641,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV12
@@ -679,8 +655,10 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -691,13 +669,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV21
@@ -711,8 +683,10 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -722,13 +696,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUY2
@@ -741,8 +709,10 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -752,13 +722,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READUYVY
@@ -771,8 +735,10 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -844,30 +810,36 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
);
}
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile (
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
- "1: \n"
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
- : "r"(v32) // %2
+ : "r"(v8) // %2
: "cc", "memory", "q0"
);
}
-// TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- SetRow_NEON(dst, v32, width << 2);
- dst += dst_stride;
- }
+// ARGBSetRow writes 'count' pixels using a 32 bit value repeated.
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0"
+ );
}
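The old combined helper becomes two separate contracts: a plain byte fill and a per-pixel 32-bit fill. Scalar models (sketch names only; the NEON loops handle 16 bytes or 4 pixels per iteration, so callers are expected to pass suitably rounded counts):

static void SetRow_Sketch(uint8* dst, uint8 v8, int count) {
  int x;
  for (x = 0; x < count; ++x) dst[x] = v8;  // memset-like byte fill
}
static void ARGBSetRow_Sketch(uint8* dst, uint32 v32, int count) {
  int x;
  for (x = 0; x < count; ++x) {
    ((uint32*)dst)[x] = v32;                // one ARGB pixel per store
  }
}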
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
@@ -1273,53 +1245,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
);
}
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
- "vrhadd.u8 q0, q1 \n" // average row 1 and 2
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(src_uv_stride), // %1
- "+r"(dst_uv), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- "vmov.u32 d6[0], %3 \n" // selector
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
- "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
- "vtrn.u32 d4, d5 \n" // combine 8 pixels
- MEMACCESS(1)
- "vst1.8 {d4}, [%1]! \n" // store 8.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- : "r"(selector) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
// Select G channels from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
@@ -2832,7 +2757,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
"vmovl.u8 q9, d18 \n" // g
"vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q15, d22 \n" // a
+ "vmovl.u8 q11, d22 \n" // a
"vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
"vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
"vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
@@ -2853,10 +2778,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
"vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
@@ -2872,7 +2797,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(matrix_argb) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+ : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
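The fix above keeps alpha in q11 so the q15 accumulator no longer clobbers it. Per pixel, the row applies a signed 4x4 matrix with 6 fraction bits; a scalar sketch with the lane layout inferred from the d-register indices above (illustrative, not libyuv's C reference):

static void ARGBColorMatrixPixel_Sketch(const uint8 src[4], uint8 dst[4],
                                        const int8 m[16]) {
  int b = src[0], g = src[1], r = src[2], a = src[3];
  int i;
  for (i = 0; i < 4; ++i) {  // i indexes the output channel: B, G, R, A
    int v = (b * m[4 * i + 0] + g * m[4 * i + 1] +
             r * m[4 * i + 2] + a * m[4 * i + 3]) >> 6;
    dst[i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}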
@@ -3140,7 +3065,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
: "cc", "memory", "q0", "q1" // Clobber List
);
}
-#endif // __ARM_NEON__
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon64.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon64.cc
index 21111cf60f3..ddccd5d98b7 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon64.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon64.cc
@@ -1,5 +1,5 @@
/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -15,113 +15,157 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for GCC Neon
+// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
- "vld1.32 {d2[0]}, [%1]! \n" \
+ "ld1 {v1.s}[0], [%1], #4 \n" \
MEMACCESS(2) \
- "vld1.32 {d2[1]}, [%2]! \n"
+ "ld1 {v1.s}[1], [%2], #4 \n"
// Read 8 Y, 2 U and 2 V from 422
#define READYUV411 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
- "vld1.16 {d2[0]}, [%1]! \n" \
+ "ld1 {v2.h}[0], [%1], #2 \n" \
MEMACCESS(2) \
- "vld1.16 {d2[1]}, [%2]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vzip.u8 d2, d3 \n"
+ "ld1 {v2.h}[1], [%2], #2 \n" \
+ "zip1 v1.8b, v2.8b, v2.8b \n"
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
+ "ld1 {v1.d}[0], [%1], #8 \n" \
MEMACCESS(2) \
- "vld1.8 {d3}, [%2]! \n" \
- "vpaddl.u8 q1, q1 \n" \
- "vrshrn.u16 d2, q1, #1 \n"
+ "ld1 {v1.d}[1], [%2], #8 \n" \
+ "uaddlp v1.8h, v1.16b \n" \
+ "rshrn v1.8b, v1.8h, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- "vmov.u8 d2, #128 \n"
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "movi v1.8b , #128 \n"
// Read 8 Y and 4 UV from NV12
#define READNV12 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
// Read 8 Y and 4 VU from NV21
#define READNV21 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
- "vuzp.u8 d3, d2 \n" \
- "vtrn.u32 d2, d3 \n"
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v3.8b, v2.8b, v2.8b \n" \
+ "uzp2 v1.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
// Read 8 YUY2
#define READYUY2 \
MEMACCESS(0) \
- "vld2.8 {d0, d2}, [%0]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
+ "uzp2 v3.8b, v1.8b, v1.8b \n" \
+ "uzp1 v1.8b, v1.8b, v1.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
// Read 8 UYVY
#define READUYVY \
MEMACCESS(0) \
- "vld2.8 {d2, d3}, [%0]! \n" \
- "vmov.u8 d0, d3 \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-#define YUV422TORGB \
- "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
- "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
- "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
- "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
- "vtrn.u8 d0, d1 \n" \
- "vsub.s16 q0, q0, q15 \n"/* offset y */\
- "vmul.s16 q0, q0, q14 \n" \
- "vadd.s16 d18, d19 \n" \
- "vqadd.s16 d20, d0, d16 \n" /* B */ \
- "vqadd.s16 d21, d1, d16 \n" \
- "vqadd.s16 d22, d0, d17 \n" /* R */ \
- "vqadd.s16 d23, d1, d17 \n" \
- "vqadd.s16 d16, d0, d18 \n" /* G */ \
- "vqadd.s16 d17, d1, d18 \n" \
- "vqshrun.s16 d0, q10, #6 \n" /* B */ \
- "vqshrun.s16 d1, q11, #6 \n" /* G */ \
- "vqshrun.s16 d2, q8, #6 \n" /* R */ \
- "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
- "vmovl.u8 q11, d1 \n" \
- "vmovl.u8 q8, d2 \n" \
- "vtrn.u8 d20, d21 \n" \
- "vtrn.u8 d22, d23 \n" \
- "vtrn.u8 d16, d17 \n" \
- "vmov.u8 d21, d16 \n"
-
-static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
- 0, 0, 0, 0, 0, 0, 0, 0 };
+ "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
+ "orr v0.8b, v3.8b, v3.8b \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
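A note on the input normalization: every READ* macro above leaves v0 holding 8 Y samples and v1 holding 4 U bytes followed by 4 V bytes, regardless of source layout.

// Byte order reminders for the packed sources handled above:
//   YUY2: Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
//   UYVY: U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...
//   NV12: U0 V0 U1 V1 ...   NV21: V0 U0 V1 U1 ...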
+#define YUV422TORGB_SETUP_REG \
+ "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
+ "ld1r {v31.4s}, [%[kYToRgb]] \n" \
+ "movi v27.8h, #128 \n" \
+ "movi v28.8h, #102 \n" \
+ "movi v29.8h, #25 \n" \
+ "movi v30.8h, #52 \n"
+
+#define YUV422TORGB(vR, vG, vB) \
+ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
+ "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
+ "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
+ "ushll v0.4s, v0.4h, #0 \n" \
+ "mul v3.4s, v3.4s, v31.4s \n" \
+ "mul v0.4s, v0.4s, v31.4s \n" \
+ "sqshrun v0.4h, v0.4s, #16 \n" \
+ "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
+ "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
+ "uxtl v2.8h, v2.8b \n" \
+ "uxtl v1.8h, v1.8b \n" /* Extract U */ \
+ "mul v3.8h, v1.8h, v27.8h \n" \
+ "mul v5.8h, v1.8h, v29.8h \n" \
+ "mul v6.8h, v2.8h, v30.8h \n" \
+ "mul v7.8h, v2.8h, v28.8h \n" \
+ "sqadd v6.8h, v6.8h, v5.8h \n" \
+ "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
+ "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
+ "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
+ "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
+ "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
+ "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
+ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
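The halved weights above match the usual libyuv RGB->UV coefficients divided by two, the other factor coming from the 2x2 subsampling average. An unaveraged single-pixel sketch using the full weights (RgbToUv_Sketch is an illustrative name):

// 0x8080 supplies the +128 chroma bias plus 0.5 of rounding before >> 8.
static void RgbToUv_Sketch(int r, int g, int b, uint8* u, uint8* v) {
  *u = (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}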
+
#ifdef HAS_I444TOARGBROW_NEON
void I444ToARGBRow_NEON(const uint8* src_y,
@@ -130,31 +174,24 @@ void I444ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV444
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I444TOARGBROW_NEON
@@ -166,31 +203,24 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TOARGBROW_NEON
@@ -202,31 +232,24 @@ void I411ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV411
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I411TOARGBROW_NEON
@@ -238,32 +261,24 @@ void I422ToBGRARow_NEON(const uint8* src_y,
uint8* dst_bgra,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v21, v22, v23)
"subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- "vmov.u8 d19, #255 \n"
+ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_bgra), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TOBGRAROW_NEON
@@ -275,32 +290,24 @@ void I422ToABGRRow_NEON(const uint8* src_y,
uint8* dst_abgr,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_abgr), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TOABGRROW_NEON
@@ -312,31 +319,24 @@ void I422ToRGBARow_NEON(const uint8* src_y,
uint8* dst_rgba,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v23, v22, v21)
"subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n"
+ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_rgba), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TORGBAROW_NEON
@@ -348,30 +348,23 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
uint8* dst_rgb24,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
MEMACCESS(3)
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TORGB24ROW_NEON
@@ -383,46 +376,33 @@ void I422ToRAWRow_NEON(const uint8* src_y,
uint8* dst_raw,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
MEMACCESS(3)
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_raw), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_raw), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TORAWROW_NEON
#define ARGBTORGB565 \
- "vshr.u8 d20, d20, #3 \n" /* B */ \
- "vshr.u8 d21, d21, #2 \n" /* G */ \
- "vshr.u8 d22, d22, #3 \n" /* R */ \
- "vmovl.u8 q8, d20 \n" /* B */ \
- "vmovl.u8 q9, d21 \n" /* G */ \
- "vmovl.u8 q10, d22 \n" /* R */ \
- "vshl.u16 q9, q9, #5 \n" /* G */ \
- "vshl.u16 q10, q10, #11 \n" /* R */ \
- "vorr q0, q8, q9 \n" /* BG */ \
- "vorr q0, q0, q10 \n" /* BGR */
+ "shll v0.8h, v22.8b, #8 \n" /* R */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "sri v0.8h, v21.8h, #5 \n" /* RG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* RGB */
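The shift-right-and-insert sequence above packs 8:8:8 down to 5:6:5; the scalar equivalent for one pixel (sketch):

static uint16 PackRGB565_Sketch(uint8 b, uint8 g, uint8 r) {
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}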
#ifdef HAS_I422TORGB565ROW_NEON
void I422ToRGB565Row_NEON(const uint8* src_y,
@@ -431,49 +411,36 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
ARGBTORGB565
MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_rgb565), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TORGB565ROW_NEON
#define ARGBTOARGB1555 \
- "vshr.u8 q10, q10, #3 \n" /* B */ \
- "vshr.u8 d22, d22, #3 \n" /* R */ \
- "vshr.u8 d23, d23, #7 \n" /* A */ \
- "vmovl.u8 q8, d20 \n" /* B */ \
- "vmovl.u8 q9, d21 \n" /* G */ \
- "vmovl.u8 q10, d22 \n" /* R */ \
- "vmovl.u8 q11, d23 \n" /* A */ \
- "vshl.u16 q9, q9, #5 \n" /* G */ \
- "vshl.u16 q10, q10, #10 \n" /* R */ \
- "vshl.u16 q11, q11, #15 \n" /* A */ \
- "vorr q0, q8, q9 \n" /* BG */ \
- "vorr q1, q10, q11 \n" /* RA */ \
- "vorr q0, q0, q1 \n" /* BGRA */
+ "shll v0.8h, v23.8b, #8 \n" /* A */ \
+ "shll v22.8h, v22.8b, #8 \n" /* R */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "sri v0.8h, v22.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* ARGB */
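Same idea for 1:5:5:5; one pixel in scalar form (sketch):

static uint16 PackARGB1555_Sketch(uint8 b, uint8 g, uint8 r, uint8 a) {
  return (uint16)(((a >> 7) << 15) | ((r >> 3) << 10) |
                  ((g >> 3) << 5) | (b >> 3));
}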
#ifdef HAS_I422TOARGB1555ROW_NEON
void I422ToARGB1555Row_NEON(const uint8* src_y,
@@ -482,44 +449,38 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
ARGBTOARGB1555
MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
- "bgt 1b \n"
+    "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB1555.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb1555), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TOARGB1555ROW_NEON
#define ARGBTOARGB4444 \
- "vshr.u8 d20, d20, #4 \n" /* B */ \
- "vbic.32 d21, d21, d4 \n" /* G */ \
- "vshr.u8 d22, d22, #4 \n" /* R */ \
- "vbic.32 d23, d23, d4 \n" /* A */ \
- "vorr d0, d20, d21 \n" /* BG */ \
- "vorr d1, d22, d23 \n" /* RA */ \
- "vzip.u8 d0, d1 \n" /* BGRA */
+ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
+ "ushr v20.8b, v20.8b, #4 \n" /* B */ \
+ "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
+ "ushr v22.8b, v22.8b, #4 \n" /* R */ \
+ "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
+ "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
+ "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
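And for 4:4:4:4, where the bic/ushr pair keeps one nibble per channel (sketch):

static uint16 PackARGB4444_Sketch(uint8 b, uint8 g, uint8 r, uint8 a) {
  return (uint16)(((a >> 4) << 12) | ((r >> 4) << 8) |
                  ((g >> 4) << 4) | (b >> 4));
}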
#ifdef HAS_I422TOARGB4444ROW_NEON
void I422ToARGB4444Row_NEON(const uint8* src_y,
@@ -528,33 +489,26 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
ARGBTOARGB4444
MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
- "bgt 1b \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb4444), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TOARGB4444ROW_NEON
@@ -564,29 +518,22 @@ void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV400
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_YTOARGBROW_NEON
@@ -596,22 +543,21 @@ void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- ".p2align 2 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
"subs %2, %2, #8 \n"
MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
- : "cc", "memory", "d20", "d21", "d22", "d23"
+ : "cc", "memory", "v20", "v21", "v22", "v23"
);
}
#endif // HAS_I400TOARGBROW_NEON
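For reference, the orr-as-mov idiom above just replicates the gray value into all three color channels; scalar model (sketch name only):

static void I400ToARGB_Sketch(const uint8* src_y, uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[4 * x + 0] = src_y[x];  // B
    dst_argb[4 * x + 1] = src_y[x];  // G
    dst_argb[4 * x + 2] = src_y[x];  // R
    dst_argb[4 * x + 3] = 255;       // A
  }
}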
@@ -622,30 +568,23 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV12
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(2)
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_NV12TOARGBROW_NEON
@@ -656,30 +595,23 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV21
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(2)
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_NV21TOARGBROW_NEON
@@ -690,30 +622,23 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV12
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
ARGBTORGB565
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+    "st1 {v0.8h}, [%2], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_NV12TORGB565ROW_NEON
@@ -724,30 +649,23 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV21
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
ARGBTORGB565
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+    "st1 {v0.8h}, [%2], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_NV21TORGB565ROW_NEON
@@ -757,29 +675,22 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUY2
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_YUY2TOARGBROW_NEON
@@ -789,29 +700,22 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READUYVY
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_UYVYTOARGBROW_NEON
@@ -821,16 +725,15 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2)
"st1 {v1.16b}, [%2], #16 \n" // store V
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -846,7 +749,6 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load U
@@ -854,8 +756,8 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(2)
- "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "bgt 1b \n"
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
:
"+r"(src_u), // %0
"+r"(src_v), // %1
@@ -871,14 +773,13 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#ifdef HAS_COPYROW_NEON
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
+ "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
MEMACCESS(1)
- "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
- "bgt 1b \n"
+ "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
@@ -888,35 +789,36 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_NEON
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-#ifdef HAS_SETROW_NEON
-void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile (
- "dup v0.4s, %w2 \n" // duplicate 4 ints
- "1: \n"
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
- : "r"(v32) // %2
+ : "r"(v8) // %2
: "cc", "memory", "v0"
);
}
-#endif // HAS_SETROW_NEON
-// TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-#ifdef HAS_ARGBSETROWS_NEON
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- SetRow_NEON(dst, v32, width << 2);
- dst += dst_stride;
- }
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+    "subs %1, %1, #4 \n" // 4 pixels per loop
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0"
+ );
}
-#endif // HAS_ARGBSETROWS_NEON
#ifdef HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
@@ -925,7 +827,6 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2 \n"
"sub %0, %0, #16 \n"
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
@@ -935,7 +836,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
"st1 {v0.D}[0], [%1], #8 \n"
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -953,7 +854,6 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"add %0, %0, %3, lsl #1 \n"
"sub %0, %0, #16 \n"
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
@@ -961,10 +861,10 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"rev64 v0.8b, v0.8b \n"
"rev64 v1.8b, v1.8b \n"
MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n"
- "bgt 1b \n"
+ "st1 {v1.8b}, [%2], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -982,7 +882,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2, lsl #2 \n"
"sub %0, %0, #16 \n"
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
@@ -992,7 +891,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
"st1 {v0.D}[0], [%1], #8 \n"
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -1006,14 +905,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"movi v4.8b, #255 \n" // Alpha
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
- "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -1027,16 +925,15 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
"movi v5.8b, #255 \n" // Alpha
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop.
- "mov v3.8b, v1.8b \n" // move g
- "mov v4.8b, v0.8b \n" // move r
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
MEMACCESS(1)
- "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
- "bgt 1b \n"
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -1047,118 +944,127 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
#endif // HAS_RAWTOARGBROW_NEON
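+// Note on the mov -> orr rewrites in this file: for vectors, "mov Vd, Vn"
+// is an alias of "orr Vd, Vn, Vn", so the change is behavior-preserving;
+// it is presumably spelled out for assemblers that do not accept the alias.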
#define RGB565TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
- "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
- "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
+ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
+ "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
+ "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
+ "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
+ "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
+ "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
+ "dup v2.2D, v0.D[1] \n" /* R */
#ifdef HAS_RGB565TOARGBROW_NEON
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
);
}
#endif // HAS_RGB565TOARGBROW_NEON
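+// For reference, RGB565TOARGB expands 5/6/5 bits to 8 by shifting each field
+// to the top of the byte and replicating its high bits into the low bits.
+// Scalar sketch per pixel (illustrative only):
+//   uint8 b = (rgb565 & 0x1f) << 3;         b |= b >> 5;  // BBBBB000 | BBB
+//   uint8 g = ((rgb565 >> 5) & 0x3f) << 2;  g |= g >> 6;  // GGGGGG00 | GG
+//   uint8 r = (rgb565 >> 11) << 3;          r |= r >> 5;  // RRRRR000 | RRR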
#define ARGB1555TOARGB \
- "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
- "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
- "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
- "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
- "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
- "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
- "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
- "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
- "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
- "vorr.u8 q1, q1, q3 \n" /* R,A */ \
- "vorr.u8 q0, q0, q2 \n" /* B,G */ \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
+ \
+ "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
+ "xtn2 v3.16b, v2.8h \n" \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
+ "dup v1.2D, v0.D[1] \n" \
+ "dup v3.2D, v2.D[1] \n"
// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
#define RGB555TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
- "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
- "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
+ "dup v1.2D, v0.D[1] \n" /* G */ \
#ifdef HAS_ARGB1555TOARGBROW_NEON
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
int pix) {
asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_ARGB1555TOARGBROW_NEON
#define ARGB4444TOARGB \
- "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
- "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
- "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
- "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
- "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
- "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
- "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
- "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
+ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
+ "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
+ "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
+ "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
+ "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
+ "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
+ "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
+ "dup v0.2D, v2.D[1] \n" \
+ "dup v1.2D, v3.D[1] \n"
#ifdef HAS_ARGB4444TOARGBROW_NEON
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
int pix) {
asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
#endif // HAS_ARGB4444TOARGBROW_NEON
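+// For reference, ARGB4444TOARGB widens each 4-bit channel x to 8 bits by
+// nibble replication, (x << 4) | x, mapping 0x0 -> 0x00 and 0xf -> 0xff
+// exactly. Scalar sketch (illustrative only):
+//   uint8 Expand4(uint8 x) { return (uint8)((x << 4) | x); }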
@@ -1166,14 +1072,13 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
#ifdef HAS_ARGBTORGB24ROW_NEON
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
- "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
- "bgt 1b \n"
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(pix) // %2
@@ -1186,16 +1091,15 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
#ifdef HAS_ARGBTORAWROW_NEON
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop.
- "mov v4.8b, v2.8b \n" // mov g
- "mov v5.8b, v1.8b \n" // mov b
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
MEMACCESS(1)
- "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
- "bgt 1b \n"
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(pix) // %2
@@ -1208,14 +1112,13 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
#ifdef HAS_YUY2TOYROW_NEON
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -1228,14 +1131,13 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
#ifdef HAS_UYVYTOYROW_NEON
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -1249,16 +1151,15 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1273,16 +1174,15 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1296,29 +1196,29 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
#ifdef HAS_YUY2TOUVROW_NEON
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
- "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
MEMACCESS(2)
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
- "+r"(stride_yuy2), // %1
+ "+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v5", "v6", "v7" // Clobber List
);
}
#endif // HAS_YUY2TOUVROW_NEON
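+// Note: the two-row UV path averages chroma with urhadd, an unsigned
+// rounding halving add. Scalar equivalent for one sample (illustrative):
+//   dst_u[i] = (uint8)((row0_u[i] + row1_u[i] + 1) >> 1);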
@@ -1326,84 +1226,33 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
#ifdef HAS_UYVYTOUVROW_NEON
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
- "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
- "+r"(stride_uyvy), // %1
+ "+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v5", "v6", "v7" // Clobber List
);
}
#endif // HAS_UYVYTOUVROW_NEON
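+// Note: computing the second-row pointer (src_yuy2b / src_uyvyb) in C and
+// passing it as a separate operand replaces the old in-asm
+// "add %x1, %x0, %w1, sxtw", presumably to avoid mixing 32- and 64-bit
+// operand modifiers inside the inline asm.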
-#ifdef HAS_HALFROW_NEON
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %x1, %x0, %w1, sxtw \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
- "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
- MEMACCESS(2)
- "st1 {v0.16b}, [%2], #16 \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(src_uv_stride), // %1
- "+r"(dst_uv), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-#endif // HAS_HALFROW_NEON
-
-// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
-#ifdef HAS_ARGBTOBAYERROW_NEON
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- "mov v2.s[0], %w3 \n" // selector
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop
- "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
- "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
- "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
- MEMACCESS(1)
- "st1 {v4.8b}, [%1], #8 \n" // store 8.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- : "r"(selector) // %3
- : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
- );
-}
-#endif // HAS_ARGBTOBAYERROW_NEON
-
// Select G channels from ARGB. e.g. GGGGGGGG
#ifdef HAS_ARGBTOBAYERGGROW_NEON
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
@@ -1411,11 +1260,11 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile (
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
@@ -1439,7 +1288,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 4.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -1455,19 +1304,18 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_v,
uint8* dst_yuy2, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "mov v2.8b, v1.8b \n"
+ "orr v2.8b, v1.8b, v1.8b \n"
MEMACCESS(1)
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
- "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1485,19 +1333,18 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_v,
uint8* dst_uyvy, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
- "mov v3.8b, v2.8b \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v3.8b, v2.8b, v2.8b \n"
MEMACCESS(1)
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
- "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1512,20 +1359,19 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
#ifdef HAS_ARGBTORGB565ROW_NEON
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
);
}
#endif // HAS_ARGBTORGB565ROW_NEON
@@ -1534,20 +1380,19 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
- "bgt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
);
}
#endif // HAS_ARGBTOARGB1555ROW_NEON
@@ -1556,21 +1401,20 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
int pix) {
asm volatile (
- "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- ".p2align 2 \n"
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
"1: \n"
MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
- "bgt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
);
}
#endif // HAS_ARGBTOARGB4444ROW_NEON
@@ -1582,10 +1426,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
@@ -1594,7 +1437,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -1610,10 +1453,9 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"movi v4.8b, #15 \n" // B * 0.11400 coefficient
"movi v5.8b, #75 \n" // G * 0.58700 coefficient
"movi v6.8b, #38 \n" // R * 0.29900 coefficient
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
@@ -1621,7 +1463,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -1636,41 +1478,41 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
- "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v24", "v25", "v26", "v27", "v28", "v29"
);
}
#endif // HAS_ARGBTOUV444ROW_NEON
@@ -1680,49 +1522,41 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"subs %3, %3, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q0, q10 \n" // B
- "vmls.s16 q8, q1, q11 \n" // G
- "vmls.s16 q8, q2, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "mul v3.8h, v0.8h, v20.8h \n" // B
+ "mls v3.8h, v1.8h, v21.8h \n" // G
+ "mls v3.8h, v2.8h, v22.8h \n" // R
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "vmul.s16 q9, q2, q10 \n" // R
- "vmls.s16 q9, q1, q14 \n" // G
- "vmls.s16 q9, q0, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "mul v4.8h, v2.8h, v20.8h \n" // R
+ "mls v4.8h, v1.8h, v24.8h \n" // G
+ "mls v4.8h, v0.8h, v23.8h \n" // B
+ "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBTOUV422ROW_NEON
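+// Sampling recap: ARGBToUV444 emits one U,V per pixel using the full 8-bit
+// coefficients; ARGBToUV422 averages horizontal pairs (16 pixels -> 8 U,V)
+// and ARGBToUV411 below averages groups of four (32 pixels -> 8 U,V), both
+// using the half-scaled 16-bit coefficients from RGBTOUV_SETUP_REG.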
@@ -1732,128 +1566,108 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(0)
- "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
- "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
+ "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
- "vpadd.u16 d1, d8, d9 \n" // B
- "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
- "vpadd.u16 d3, d10, d11 \n" // G
- "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
- "vpadd.u16 d5, d12, d13 \n" // R
+ "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
+ "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
+ "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %3, %3, #32 \n" // 32 processed per loop.
- "vmul.s16 q8, q0, q10 \n" // B
- "vmls.s16 q8, q1, q11 \n" // G
- "vmls.s16 q8, q2, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q2, q10 \n" // R
- "vmls.s16 q9, q1, q14 \n" // G
- "vmls.s16 q9, q0, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "mul v3.8h, v0.8h, v20.8h \n" // B
+ "mls v3.8h, v1.8h, v21.8h \n" // G
+ "mls v3.8h, v2.8h, v22.8h \n" // R
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "mul v4.8h, v2.8h, v20.8h \n" // R
+ "mls v4.8h, v1.8h, v24.8h \n" // G
+ "mls v4.8h, v0.8h, v23.8h \n" // B
+ "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBTOUV411ROW_NEON
// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
- "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
- "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
- "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
- "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
- "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
- "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
- "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
- "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
- "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
- "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
+ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
+ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
+ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
+ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
+ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
+ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
+ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
#ifdef HAS_ARGBTOUVROW_NEON
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
+ "+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBTOUVROW_NEON
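+// For reference, the 2x2 box filter feeding RGBTOUV above: uaddlp sums each
+// horizontal pair, uadalp accumulates the pair sums of the second row, and
+// urshr #1 halves with rounding. Scalar sketch for one channel
+// (illustrative):
+//   b2x = (B[y][x] + B[y][x+1] + B[y+1][x] + B[y+1][x+1] + 1) >> 1;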
@@ -1862,50 +1676,45 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
#ifdef HAS_ARGBTOUVJROW_NEON
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
+ "+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBTOUVJROW_NEON
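+// Note: the "J" variant uses full-range JPEG (BT.601) chroma constants, per
+// the comments above (0.500, -0.33126, -0.16874, -0.08131, -0.41869),
+// instead of RGBTOUV_SETUP_REG; the loop structure matches ARGBToUVRow_NEON.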
@@ -1913,50 +1722,40 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
#ifdef HAS_BGRATOUVROW_NEON
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q3, q2, q1)
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
- "+r"(src_stride_bgra), // %1
+ "+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_BGRATOUVROW_NEON
@@ -1964,50 +1763,40 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
#ifdef HAS_ABGRTOUVROW_NEON
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
+ RGBTOUV(v0.8h, v2.8h, v1.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
- "+r"(src_stride_abgr), // %1
+ "+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ABGRTOUVROW_NEON
@@ -2015,50 +1804,40 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
#ifdef HAS_RGBATOUVROW_NEON
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
- "+r"(src_stride_rgba), // %1
+ "+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_RGBATOUVROW_NEON
@@ -2066,50 +1845,40 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
#ifdef HAS_RGB24TOUVROW_NEON
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- MEMACCESS(0)
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
- "+r"(src_stride_rgb24), // %1
+ "+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_RGB24TOUVROW_NEON
@@ -2117,50 +1886,40 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
#ifdef HAS_RAWTOUVROW_NEON
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- MEMACCESS(0)
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
- "+r"(src_stride_raw), // %1
+ "+r"(src_raw_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_RAWTOUVROW_NEON
@@ -2169,70 +1928,74 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
#ifdef HAS_RGB565TOUVROW_NEON
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
+ "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
+ "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
+ "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
+ "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
+ "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "ins v16.D[1], v17.D[0] \n"
+ "ins v18.D[1], v19.D[0] \n"
+ "ins v20.D[1], v21.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v18.8h, #1 \n"
+ "urshr v6.8h, v20.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "mul v16.8h, v4.8h, v22.8h \n" // B
+ "mls v16.8h, v5.8h, v23.8h \n" // G
+ "mls v16.8h, v6.8h, v24.8h \n" // R
+ "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
+ "mul v17.8h, v6.8h, v22.8h \n" // R
+ "mls v17.8h, v5.8h, v26.8h \n" // G
+ "mls v17.8h, v4.8h, v25.8h \n" // B
+ "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
- "+r"(src_stride_rgb565), // %1
+ "+r"(src_rgb565_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+ "v25", "v26", "v27"
);
}
#endif // HAS_RGB565TOUVROW_NEON
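+// Note: the 16-bit-source UV paths unpack 8 pixels at a time into 4h partial
+// sums, then "ins Vd.D[1], Vn.D[0]" splices two 4h halves into one 8h vector
+// before the rounding average and the U/V multiply.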
@@ -2241,70 +2004,69 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
#ifdef HAS_ARGB1555TOUVROW_NEON
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
- "+r"(src_stride_argb1555), // %1
+ "+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+ "v26", "v27", "v28"
);
}
#endif // HAS_ARGB1555TOUVROW_NEON
@@ -2313,70 +2075,70 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
#ifdef HAS_ARGB4444TOUVROW_NEON
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
- "+r"(src_stride_argb4444), // %1
+ "+r"(src_argb4444_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+ "v26", "v27", "v28"
+
);
}
#endif // HAS_ARGB4444TOUVROW_NEON
@@ -2384,29 +2146,29 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
#ifdef HAS_RGB565TOYROW_NEON
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
+ "v24", "v25", "v26", "v27"
);
}
#endif // HAS_RGB565TOYROW_NEON
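
For reference, a scalar sketch of the Y math above, with the coefficients and rounding taken from the code comments (the helper name is hypothetical). The same 13/65/33 weights recur in every ToY row that follows; only the unpack step and channel order change:

// Scalar sketch: expand RGB565 to full-range 8-bit, then
// Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16; sqrshrun #7 supplies the
// +64 rounding term and uqadd saturates the +16.
static unsigned char RGB565ToY_sketch(unsigned short p) {
  int b = (p & 0x1f) << 3, g = ((p >> 5) & 0x3f) << 2, r = (p >> 11) << 3;
  b |= b >> 5; g |= g >> 6; r |= r >> 5;  /* replicate top bits */
  int y = ((13 * b + 65 * g + 33 * r + 64) >> 7) + 16;
  return (unsigned char)(y > 255 ? 255 : y);
}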
@@ -2414,29 +2176,28 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
#ifdef HAS_ARGB1555TOYROW_NEON
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGB1555TOYROW_NEON
@@ -2444,29 +2205,28 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
#ifdef HAS_ARGB4444TOYROW_NEON
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
);
}
#endif // HAS_ARGB4444TOYROW_NEON
@@ -2474,28 +2234,27 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
#ifdef HAS_BGRATOYROW_NEON
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_BGRATOYROW_NEON
@@ -2503,28 +2262,27 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
#ifdef HAS_ABGRTOYROW_NEON
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
+ "+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_ABGRTOYROW_NEON
@@ -2532,28 +2290,27 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
#ifdef HAS_RGBATOYROW_NEON
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
+ "+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_RGBATOYROW_NEON
@@ -2561,28 +2318,27 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
#ifdef HAS_RGB24TOYROW_NEON
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_RGB24TOYROW_NEON
@@ -2590,28 +2346,27 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
#ifdef HAS_RAWTOYROW_NEON
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_RAWTOYROW_NEON
@@ -2621,96 +2376,98 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
asm volatile (
"cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
+ "b.eq 100f \n"
"cmp %4, #64 \n"
- "beq 75f \n"
+ "b.eq 75f \n"
"cmp %4, #128 \n"
- "beq 50f \n"
+ "b.eq 50f \n"
"cmp %4, #192 \n"
- "beq 25f \n"
+ "b.eq 25f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
// General purpose row blend.
"1: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q0}, [%2]! \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
- "+r"(src_stride), // %2
+ "+r"(src_ptr1), // %2
"+r"(dst_width), // %3
- "+r"(source_y_fraction) // %4
+ "+r"(y1_fraction), // %4
+ "+r"(y0_fraction) // %5
:
- : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
);
}
#endif // HAS_INTERPOLATEROW_NEON
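
A scalar sketch of the general blend path (the 0/25/50/75/100% fractions are short-circuited above to plain copies and rounding halving adds); the function name is illustrative:

// Scalar sketch for 0 < f < 256:
// out = (row0*(256 - f) + row1*f + 128) >> 8, matching the umull/umlal
// pairs and the rounding narrow rshrn #8.
static void InterpolateRow_sketch(unsigned char* dst, const unsigned char* s0,
                                  const unsigned char* s1, int width, int f) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (unsigned char)((s0[i] * (256 - f) + s1[i] * f + 128) >> 8);
  }
}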
@@ -2720,55 +2477,59 @@ void InterpolateRow_NEON(uint8* dst_ptr,
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
- "subs %3, #8 \n"
- "blt 89f \n"
+ "subs %3, %3, #8 \n"
+ "b.lt 89f \n"
// Blend 8 pixels.
"8: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.ge 8b \n"
"89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
+ "adds %3, %3, #8-1 \n"
+ "b.lt 99f \n"
// Blend 1 pixels.
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1)
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
MEMACCESS(2)
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
"99: \n"
@@ -2777,7 +2538,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18"
);
}
#endif // HAS_ARGBBLENDROW_NEON
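
Per channel, the blend above is a source-over composite with 8-bit alpha. A hedged scalar sketch (helper name assumed):

// out = sat(s + (d - sat((d*a + 128) >> 8))), with the output alpha
// forced to 255 afterward. uqrshrn #8 is the rounding shift; uqsub and
// uqadd saturate at 0 and 255 respectively.
static unsigned char BlendChannel_sketch(int s, int d, int a) {
  int da = (d * a + 128) >> 8;
  int out = s + (d > da ? d - da : 0);
  return (unsigned char)(out > 255 ? 255 : out);
}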
@@ -2789,22 +2551,22 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Attenuate 8 pixels.
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
- "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
- "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
- "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
- : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
#endif // HAS_ARGBATTENUATEROW_NEON
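
The attenuate row premultiplies each color channel by alpha; a one-channel scalar sketch (illustrative name):

// c' = (c * a + 128) >> 8, i.e. roughly c * a / 256 with rounding.
// uqrshrn #8 saturates, though the product never exceeds 255 here.
static unsigned char Attenuate_sketch(int c, int a) {
  int v = (c * a + 128) >> 8;
  return (unsigned char)(v > 255 ? 255 : v);
}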
@@ -2815,41 +2577,40 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
asm volatile (
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
- "vqdmulh.s16 q0, q0, q8 \n" // b * scale
- "vqdmulh.s16 q1, q1, q8 \n" // g
- "vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- MEMACCESS(0)
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ MEMACCESS(0)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
"r"(interval_size), // %3
"r"(interval_offset) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
#endif // HAS_ARGBQUANTIZEROW_NEON
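
Quantize snaps each color channel onto an interval grid; sqdmulh with the pre-halved scale is effectively a (c * scale) >> 16 multiply. A scalar sketch under that reading (name assumed):

// v = ((c * scale) >> 16) * interval_size + interval_offset, clamped as
// uqxtn would; alpha (v3) is stored back untouched.
static unsigned char Quantize_sketch(int c, int scale, int size, int offset) {
  int v = ((c * scale) >> 16) * size + offset;
  return (unsigned char)(v > 255 ? 255 : v);
}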
@@ -2861,36 +2622,35 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
asm volatile (
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
- "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
- "vqrdmulh.s16 q11, q11, d0[1] \n" // g
- "vqrdmulh.s16 q12, q12, d0[2] \n" // r
- "vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ MEMACCESS(1)
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(value) // %3
- : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+ : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBSHADEROW_NEON
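
Shade scales all four channels by the bytes of value; the zip1/ushr/sqrdmulh sequence works out to roughly c * v / 255 per channel, alpha included. A scalar approximation (hypothetical helper):

// zip1 builds v*0x101 in each 16-bit lane, ushr #1 halves it, and
// sqrdmulh(c, x) computes (2*c*x + 0x8000) >> 16 -- about c*v/255.
static unsigned char Shade_sketch(int c, int v) {
  int out = (c * v * 0x101 + 0x8000) >> 16;
  return (unsigned char)(out > 255 ? 255 : out);
}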
@@ -2901,28 +2661,27 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBGRAYROW_NEON
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- ".p2align 2 \n"
+ "movi v24.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v25.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v26.8b, #38 \n" // R * 0.29900 coefficient
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
);
}
#endif // HAS_ARGBGRAYROW_NEON
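
The gray row uses JPEG-range luma weights (15, 75, 38, which sum to 128) and writes the result to B, G and R alike, leaving alpha untouched. A scalar sketch:

// gray = (15*b + 75*g + 38*r + 64) >> 7; because the weights sum to 128
// the result stays in 0..255 and the sqrshrun saturation never fires.
static unsigned char Gray_sketch(int b, int g, int r) {
  return (unsigned char)((15 * b + 75 * g + 38 * r + 64) >> 7);
}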
@@ -2935,40 +2694,39 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBSEPIAROW_NEON
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
asm volatile (
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
- "vmov.u8 d28, #24 \n" // BB coefficient
- "vmov.u8 d29, #98 \n" // BG coefficient
- "vmov.u8 d30, #50 \n" // BR coefficient
- ".p2align 2 \n"
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+    "movi       v28.8b, #24                     \n"  // RB coefficient
+    "movi       v29.8b, #98                     \n"  // RG coefficient
+    "movi       v30.8b, #50                     \n"  // RR coefficient
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- MEMACCESS(0)
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
);
}
#endif // HAS_ARGBSEPIAROW_NEON
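
Sepia is a 3x3 matrix in 7-bit fixed point applied in place, with unsigned saturation and no rounding term (uqshrn, not uqrshrn). A scalar sketch on one BGRA pixel (names assumed):

static void Sepia_sketch(unsigned char* p) {  /* p = B,G,R,A */
  int b = p[0], g = p[1], r = p[2];
  int nb = (17 * b + 68 * g + 35 * r) >> 7;   /* G and R row weights  */
  int ng = (22 * b + 88 * g + 45 * r) >> 7;   /* sum past 128, so the */
  int nr = (24 * b + 98 * g + 50 * r) >> 7;   /* clamps below matter  */
  p[0] = (unsigned char)(nb > 255 ? 255 : nb);
  p[1] = (unsigned char)(ng > 255 ? 255 : ng);
  p[2] = (unsigned char)(nr > 255 ? 255 : nr);
}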
@@ -2981,60 +2739,59 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) {
asm volatile (
MEMACCESS(3)
- "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+    "ld1        {v2.16b}, [%3]                  \n"  // load 16 matrix coefficients.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q15, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
- "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
- "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
- "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- MEMACCESS(1)
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ MEMACCESS(1)
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(matrix_argb) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBCOLORMATRIXROW_NEON
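
ColorMatrix generalizes the sepia case: matrix_argb holds 16 signed bytes in 6-bit fixed point, rows ordered B, G, R, A, each row carrying the weights for the input (b, g, r, a). A scalar sketch of one output channel (helper name assumed):

// out = sat((row[0]*b + row[1]*g + row[2]*r + row[3]*a) >> 6), matching
// sqshrun #6; the NEON version accumulates with sqadd, so intermediates
// saturate at s16 range as well.
static unsigned char MatrixChannel_sketch(const signed char* row,
                                          int b, int g, int r, int a) {
  int v = (row[0] * b + row[1] * g + row[2] * r + row[3] * a) >> 6;
  return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}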
@@ -3046,12 +2803,11 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
@@ -3062,8 +2818,8 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -3081,20 +2837,19 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -3112,20 +2867,19 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -3148,7 +2902,6 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
asm volatile (
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
@@ -3156,11 +2909,11 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v1.8b \n" // add
- "mov v1.8b, v0.8b \n"
- "mov v2.8b, v0.8b \n"
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -3177,7 +2930,6 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width) {
asm volatile (
// 16 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
@@ -3187,7 +2939,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -3209,7 +2961,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
asm volatile (
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
@@ -3218,8 +2969,8 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -3238,7 +2989,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0],%5 \n" // top
@@ -3263,7 +3013,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"uqxtn v0.8b, v0.8h \n"
MEMACCESS(3)
"st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -3284,7 +3034,6 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0],%4 \n" // left
@@ -3309,7 +3058,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"uqxtn v0.8b, v0.8h \n"
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_posix.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_posix.cc
index 106fda56891..1a6f7dc4dd0 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_posix.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_posix.cc
@@ -1,3 +1,4 @@
+// VERSION 2
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
@@ -92,6 +93,7 @@ static uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
+// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {
64, 64, 64, 64, 64, 64, 64, 64
};
@@ -221,7 +223,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"1: \n"
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -229,10 +231,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
#endif // TESTING
@@ -252,37 +251,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
- int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
@@ -291,11 +259,7 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
#endif // HAS_I400TOARGBROW_SSE2
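
The x86 changes in this file largely follow one pattern: aligned movdqa loads and stores become unaligned movdqu, which is what lets the separate *_Unaligned_* variants (such as I400ToARGBRow_Unaligned_SSE2 above) be deleted outright. In intrinsics terms, roughly:

#include <emmintrin.h>
// movdqa (_mm_store_si128) faults if dst is not 16-byte aligned;
// movdqu (_mm_storeu_si128) accepts any address, and on modern cores the
// aligned-address case costs about the same, so one code path serves both.
static void StoreRow_sketch(unsigned char* dst, __m128i row) {
  _mm_storeu_si128((__m128i*)dst, row);  /* was _mm_store_si128 */
}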
@@ -318,27 +282,24 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "m"(kShuffleMaskRGB24ToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -359,27 +320,24 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "m"(kShuffleMaskRAWToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -417,9 +375,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
- MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -427,13 +384,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"+r"(dst), // %1
"+r"(pix) // %2
:
- : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -474,9 +426,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
- MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -484,13 +435,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"+r"(dst), // %1
"+r"(pix) // %2
:
- : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -518,9 +464,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2)
- MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -528,13 +473,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"+r"(dst), // %1
"+r"(pix) // %2
:
- : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -572,10 +512,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
"+r"(dst), // %1
"+r"(pix) // %2
: "m"(kShuffleMaskARGBToRGB24) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@@ -613,10 +550,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
"+r"(dst), // %1
"+r"(pix) // %2
: "m"(kShuffleMaskARGBToRAW) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@@ -631,7 +565,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pslld $0x8,%%xmm0 \n"
@@ -652,11 +586,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -672,7 +602,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm0,%%xmm3 \n"
@@ -690,17 +620,14 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"packssdw %%xmm0,%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMACCESS2(0x8,1) ",%1 \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ :: "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -712,7 +639,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
"psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm3,%%xmm0 \n"
"pand %%xmm4,%%xmm1 \n"
@@ -728,57 +655,17 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
#endif // HAS_RGB24TOARGBROW_SSSE3
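
// Scalar sketch of the packing done by the ARGBTo{RGB565,ARGB1555,ARGB4444}
// rows above, shown for the 565 case: keep the top 5/6/5 bits of B/G/R and
// pack them into one 16-bit value (hypothetical helper for illustration):
static inline unsigned short ARGBPixelToRGB565Sketch(unsigned char b,
                                                     unsigned char g,
                                                     unsigned char r) {
  return (unsigned short)((b >> 3) | ((unsigned)(g >> 2) << 5) |
                          ((unsigned)(r >> 3) << 11));
}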
#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
"movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
@@ -796,34 +683,33 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kARGBToY), // %3
"m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_ARGBTOYROW_SSSE3
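
// Scalar sketch of one Y output from the row above, assuming libyuv's
// kARGBToY weights {13, 65, 33} for B,G,R (scaled by 128) and the kAddY16
// bias: pmaddubsw forms 13*B+65*G and 33*R+0*A per pixel, phaddw sums the
// pair, psrlw truncates by 7 bits and paddb adds 16.
static inline unsigned char ARGBPixelToYSketch(unsigned char b,
                                               unsigned char g,
                                               unsigned char r) {
  return (unsigned char)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}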
#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow, but with different coefficients, no +16 bias, and
+// rounding added before the shift.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@@ -836,158 +722,131 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kARGBToYJ), // %3
"m"(kAddYJ64) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
+#endif // HAS_ARGBTOYJROW_SSSE3
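
// Scalar sketch of one YJ (full-range JPEG) output, assuming kARGBToYJ
// weights {15, 75, 38} over 128 and the kAddYJ64 rounding term added before
// the 7-bit shift; note there is no +16 bias here.
static inline unsigned char ARGBPixelToYJSketch(unsigned char b,
                                                unsigned char g,
                                                unsigned char r) {
  return (unsigned char)((15 * b + 75 * g + 38 * r + 64) >> 7);
}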
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd index vector to restore pixel order after the in-lane
+// vphaddw + vpackuswb.
+static const lvec32 kPermdARGBToY_AVX = {
+ 0, 4, 1, 5, 2, 6, 3, 7
+};
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
-#endif // HAS_ARGBTOYJROW_SSSE3
+#endif // HAS_ARGBTOYROW_AVX2
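
// The AVX2 horizontal ops above work within 128-bit lanes, so after the two
// vphaddw and the vpackuswb the 32 Y bytes land in 4-pixel groups ordered
// 0,2,4,6,1,3,5,7.  vpermd with kPermdARGBToY_AVX moves source dword
// perm[i] into destination dword i, restoring linear order; a dword-level
// sketch:
static inline void VpermdSketch(const unsigned int src[8],
                                unsigned int dst[8]) {
  static const int kPerm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
  int i;
  for (i = 0; i < 8; ++i) {
    dst[i] = src[kPerm[i]];
  }
}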
-#ifdef HAS_ARGBTOUVROW_SSSE3
-// TODO(fbarchard): pass xmm constants to single block of assembly.
-// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
-// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
-// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
-// and considered unsafe.
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "sub %1,%2 \n"
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
+ "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
+#endif // HAS_ARGBTOYJROW_AVX2
-// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToUJ), // %0
- "m"(kARGBToVJ), // %1
- "m"(kAddUVJ128) // %2
- );
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -1005,130 +864,118 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"pmaddubsw %%xmm3,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%3 \n"
+ "paddb %%xmm5,%%xmm0 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToV), // %5
+ "m"(kARGBToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
+#endif // HAS_ARGBTOUVROW_SSSE3
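
// Scalar sketch of one U/V output pair, assuming libyuv's BT.601 rows
// kARGBToU = {112, -74, -38} and kARGBToV = {-18, -94, 112} for B,G,R
// (scaled by 256).  The loop above first averages each 2x2 pixel block (two
// rows via pavgb against the stride, then horizontal pairs via shufps +
// pavgb) and re-centers the result on 128; saturation is omitted here.
static inline void ARGBBlockToUVSketch(int b, int g, int r,  /* 2x2 means */
                                       unsigned char* u, unsigned char* v) {
  *u = (unsigned char)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (unsigned char)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}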
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
+#ifdef HAS_ARGBTOUVROW_AVX2
+// vpshufb mask to restore pixel order after the in-lane vphaddw +
+// vpacksswb, viewed as 16-bit U,V pairs.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+ VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kARGBToV), // %6
+ "m"(kARGBToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
+#endif // HAS_ARGBTOUVROW_AVX2
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToUJ), // %0
- "m"(kARGBToVJ), // %1
- "m"(kAddUVJ128) // %2
- );
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
+
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -1151,104 +998,32 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb))
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToVJ), // %5
+ "m"(kARGBToUJ), // %6
+ "m"(kAddUVJ128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
+#endif // HAS_ARGBTOUVJROW_SSSE3
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6"
-#endif
- );
-}
-
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
- uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
@@ -1266,7 +1041,6 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
"psraw $0x8,%%xmm2 \n"
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
@@ -1283,98 +1057,30 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
- BUNDLEALIGN
MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6"
-#endif
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6"
);
}
+#endif // HAS_ARGBTOUV444ROW_SSSE3
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
@@ -1403,26 +1109,23 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
+#endif // HAS_ARGBTOUV422ROW_SSSE3
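
// ARGBToUV422 above differs from ARGBToUVRow only in the sampling: chroma is
// averaged over horizontal pixel pairs (shufps + pavgb) with no second-row
// pavgb, since 4:2:2 keeps full vertical chroma resolution.  pavgb's
// round-half-up average, for reference:
static inline int PavgbSketch(int a, int b) {
  return (a + b + 1) >> 1;
}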
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
@@ -1430,43 +1133,6 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -1482,116 +1148,41 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kBGRAToY), // %3
"m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kBGRAToU), // %0
- "m"(kBGRAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_bgra)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kBGRAToU), // %0
- "m"(kBGRAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
+
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -1613,24 +1204,21 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_bgra0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_bgra)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_bgra)), // %4
+ "m"(kBGRAToV), // %5
+ "m"(kBGRAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
@@ -1640,43 +1228,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -1692,19 +1243,16 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kABGRToY), // %3
"m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -1714,43 +1262,6 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kRGBAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -1766,116 +1277,41 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kRGBAToY), // %3
"m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kABGRToU), // %0
- "m"(kABGRToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_abgr)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kABGRToU), // %0
- "m"(kABGRToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
+
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -1897,121 +1333,46 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_abgr)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToV), // %5
+ "m"(kABGRToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kRGBAToU), // %0
- "m"(kRGBAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_rgba0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_rgba))
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kRGBAToU), // %0
- "m"(kRGBAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
+
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -2033,75 +1394,83 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_rgba0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_rgba)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_rgba)), // %4
+ "m"(kRGBAToV), // %5
+ "m"(kRGBAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
-#endif // HAS_ARGBTOUVROW_SSSE3
-#ifdef HAS_I422TOARGBROW_SSSE3
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-struct {
- vec8 kUVToB; // 0
- vec8 kUVToG; // 16
- vec8 kUVToR; // 32
- vec16 kUVBiasB; // 48
- vec16 kUVBiasG; // 64
- vec16 kUVBiasR; // 80
- vec16 kYSub16; // 96
- vec16 kYToRgb; // 112
- vec8 kVUToB; // 128
- vec8 kVUToG; // 144
- vec8 kVUToR; // 160
-} static SIMD_ALIGNED(kYuvConstants) = {
- { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
- { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
- { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
- { BB, BB, BB, BB, BB, BB, BB, BB },
- { BG, BG, BG, BG, BG, BG, BG, BG },
- { BR, BR, BR, BR, BR, BR, BR, BR },
- { 16, 16, 16, 16, 16, 16, 16, 16 },
- { YG, YG, YG, YG, YG, YG, YG, YG },
- { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
- { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
- { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+struct YuvConstants {
+ lvec8 kUVToB; // 0
+ lvec8 kUVToG; // 32
+ lvec8 kUVToR; // 64
+ lvec16 kUVBiasB; // 96
+ lvec16 kUVBiasG; // 128
+ lvec16 kUVBiasR; // 160
+ lvec16 kYToRgb; // 192
+};
+
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
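
// Scalar sketch of the fixed-point math in YUVTORGB below, written with the
// macro constants defined above.  Y is widened by pairing each byte with
// itself (y * 0x0101) and scaled with an unsigned high-half multiply
// (pmulhuw by YG); psraw/packuswb clamp the result, modeled here by a
// hypothetical Clamp0To255() helper.
static inline unsigned char Clamp0To255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static inline void YuvPixelSketch(unsigned char y, unsigned char u,
                                  unsigned char v, unsigned char* b,
                                  unsigned char* g, unsigned char* r) {
  int y1 = (int)(((unsigned int)(y) * 0x0101 * YG) >> 16);
  *b = Clamp0To255((BB - (u * UB) + y1) >> 6);
  *g = Clamp0To255((BG - (u * UG + v * VG) + y1) >> 6);
  *r = Clamp0To255((BR - (v * VR) + y1) >> 6);
}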
// Read 8 UV from 444
#define READYUV444 \
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- BUNDLEALIGN \
MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n"
@@ -2109,7 +1478,6 @@ struct {
// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
"movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- BUNDLEALIGN \
MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
@@ -2118,7 +1486,6 @@ struct {
// Read 2 UV from 411, upsample to 8 UV
#define READYUV411 \
"movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- BUNDLEALIGN \
MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
@@ -2132,20 +1499,23 @@ struct {
"punpcklwd %%xmm0,%%xmm0 \n"
// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB \
+#define YUVTORGB(YuvConstants) \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \
- "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \
- "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \
- "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
- "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
- "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \
+ "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \
+ "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \
+ "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
"movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
- "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
+ "punpcklbw %%xmm3,%%xmm3 \n" \
+ "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \
"paddsw %%xmm3,%%xmm0 \n" \
"paddsw %%xmm3,%%xmm1 \n" \
"paddsw %%xmm3,%%xmm2 \n" \
@@ -2156,30 +1526,51 @@ struct {
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
-// Convert 8 pixels: 8 VU and 8 Y
-#define YVUTORGB \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \
- "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \
- "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \
- "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
- "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
- "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
- "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
- "paddsw %%xmm3,%%xmm0 \n" \
- "paddsw %%xmm3,%%xmm1 \n" \
- "paddsw %%xmm3,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
+// Store 8 ARGB values. Assumes XMM5 holds 0xff bytes (opaque alpha), set by
+// the caller.
+#define STOREARGB \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm5,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+
+// Store 8 BGRA values. Sets XMM5 to 0xff bytes for the alpha channel.
+#define STOREBGRA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm0,%%xmm1 \n" \
+ "punpcklbw %%xmm2,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
+ "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \
+ "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
+
+// Store 8 ABGR values. Assumes XMM5 holds 0xff bytes (opaque alpha), set by
+// the caller.
+#define STOREABGR \
+ "punpcklbw %%xmm1,%%xmm2 \n" \
+ "punpcklbw %%xmm5,%%xmm0 \n" \
+ "movdqa %%xmm2,%%xmm1 \n" \
+ "punpcklwd %%xmm0,%%xmm2 \n" \
+ "punpckhwd %%xmm0,%%xmm1 \n" \
+ "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \
+ "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
+
+// Store 8 RGBA values. Assumes XMM5 is zero.
+#define STORERGBA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm2,%%xmm1 \n" \
+ "punpcklbw %%xmm0,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
+ "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \
+ "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -2189,19 +1580,11 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV444
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+ YUVTORGB(kYuvConstants)
+ STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2210,41 +1593,25 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+// TODO(fbarchard): Consider putting masks into constants.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_rgb24,
int width) {
-// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
-#if defined(__i386__)
- asm volatile (
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
- :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
- [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
-#endif
-
asm volatile (
-#if !defined(__i386__)
"movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
"movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
-#endif
"sub %[u_buf],%[v_buf] \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm2,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -2256,25 +1623,23 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
"movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
"lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
- "sub $0x8,%[width] \n"
+ "subl $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
-#if !defined(__i386__)
- , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
- [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
-#endif
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
);
}
@@ -2283,26 +1648,14 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* v_buf,
uint8* dst_raw,
int width) {
-// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
-#if defined(__i386__)
- asm volatile (
- "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
- :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
- [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
-#endif
-
asm volatile (
-#if !defined(__i386__)
"movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
"movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
-#endif
"sub %[u_buf],%[v_buf] \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm2,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -2314,25 +1667,23 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
"movq %%xmm0," MEMACCESS([dst_raw]) " \n"
"movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
"lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
- "sub $0x8,%[width] \n"
+ "subl $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_raw]"+r"(dst_raw), // %[dst_raw]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
-#if !defined(__i386__)
- , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
- [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
-#endif
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
+ [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+ [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
);
}
@@ -2344,19 +1695,11 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+ YUVTORGB(kYuvConstants)
+ STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2365,13 +1708,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2383,19 +1721,11 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV411
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+ YUVTORGB(kYuvConstants)
+ STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2404,13 +1734,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2420,19 +1745,11 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READNV12
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+ YUVTORGB(kYuvConstants)
+ STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2440,11 +1757,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
// Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2454,216 +1768,20 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READNV12
- YVUTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
- // Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV444
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV411
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READNV12
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+ YUVTORGB(kYuvConstants)
+ STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
- // Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READNV12
- YVUTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
+ : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
// Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2675,20 +1793,11 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n"
- "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
- "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
+ YUVTORGB(kYuvConstants)
+ STOREBGRA
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2697,13 +1806,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
[dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2715,19 +1819,11 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm2 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
- "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
+ YUVTORGB(kYuvConstants)
+ STOREABGR
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2736,13 +1832,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
[dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2754,20 +1845,11 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n"
- "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
- "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
+ YUVTORGB(kYuvConstants)
+ STORERGBA
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2776,159 +1858,233 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
[dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_bgra,
- int width) {
+#endif // HAS_I422TOARGBROW_SSSE3
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) \
+ "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
+ "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \
+ "vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \
+ "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \
+ "vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
+#if defined(HAS_I422TOBGRAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
- "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
- "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
- "sub $0x8,%[width] \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into BGRA
+ "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels
+
+ "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
+ "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
+ "sub $0x10,%[width] \n"
"jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+#endif // HAS_I422TOBGRAROW_AVX2
-void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_abgr,
- int width) {
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm2 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
- "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
- "sub $0x8,%[width] \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ARGB
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
+
+ "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
"jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
- [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+#endif // HAS_I422TOARGBROW_AVX2
-void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- int width) {
+#if defined(HAS_I422TOABGRROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
- "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
- "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
- "sub $0x8,%[width] \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ABGR
+ "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels
+ "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels
+ "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
"jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
- [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+#endif // HAS_I422TOABGRROW_AVX2
-#endif // HAS_I422TOARGBROW_SSSE3
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into RGBA
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_I422TORGBAROW_AVX2
#ifdef HAS_YTOARGBROW_SSE2
-void YToARGBRow_SSE2(const uint8* y_buf,
- uint8* dst_argb,
- int width) {
+void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "mov $0x00100010,%%eax \n"
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "mov $0x004a004a,%%eax \n"
+ "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
"movd %%eax,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
LABELALIGN
"1: \n"
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
"psubusw %%xmm3,%%xmm0 \n"
- "pmullw %%xmm2,%%xmm0 \n"
"psrlw $6, %%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
@@ -2939,8 +2095,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"por %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
@@ -2950,13 +2106,58 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"+rm"(width) // %2
:
: "memory", "cc", "eax"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
);
}
#endif // HAS_YTOARGBROW_SSE2
+#ifdef HAS_YTOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
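+// (AVX2 unpack operates per 128-bit lane, so intermediate word order is
+// permuted; the matching pack applies the inverse permutation, restoring it.)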
+void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "vmovd %%eax,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+ "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
+ "vmovd %%eax,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_YTOARGBROW_AVX2
+
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {
@@ -2967,38 +2168,56 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"movdqa %3,%%xmm5 \n"
- "lea " MEMLEA(-0x10,0) ",%0 \n"
LABELALIGN
"1: \n"
- MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
+ MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
"pshufb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
: "m"(kShuffleMirror) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
);
}
#endif // HAS_MIRRORROW_SSSE3
+#ifdef HAS_MIRRORROW_AVX2
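+// Reverses 32 bytes per iteration: vpshufb mirrors the bytes within each
+// 128-bit lane, then vpermq $0x4e swaps the two lanes to finish the reversal.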
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_MIRRORROW_AVX2
+
#ifdef HAS_MIRRORROW_SSE2
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
- "lea " MEMLEA(-0x10,0) ",%0 \n"
LABELALIGN
"1: \n"
- MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
+ MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
"movdqa %%xmm0,%%xmm1 \n"
"psllw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
@@ -3006,21 +2225,16 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
"pshuflw $0x1b,%%xmm0,%%xmm0 \n"
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1)",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
);
}
#endif // HAS_MIRRORROW_SSE2
@@ -3035,108 +2249,119 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
- "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
+ "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(-0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
- "sub $8,%3 \n"
"movlpd %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $8,%3 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
);
}
#endif // HAS_MIRRORROW_UV_SSSE3
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static uvec8 kARGBShuffleMirror = {
- 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
+#ifdef HAS_ARGBMIRRORROW_SSE2
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
- "movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
"lea " MEMLEA(-0x10,0) ",%0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
- : "m"(kARGBShuffleMirror) // %3
+ :
: "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
+ , "xmm0"
);
}
-#endif // HAS_ARGBMIRRORROW_SSSE3
+#endif // HAS_ARGBMIRRORROW_SSE2
-#ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Permute table for reversing the 32-bit pixels (dword indices for vpermd).
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "vmovdqu %3,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror_AVX2) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2
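+// Deinterleaves UV: the 0x00ff mask in ymm5 keeps the even (U) bytes and a
+// right shift by 8 exposes the odd (V) bytes; vpackuswb plus vpermq $0xd8
+// put both planes back in linear order.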
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+#endif // HAS_SPLITUVROW_AVX2
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix) {
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3164,52 +2389,46 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SPLITUVROW_SSE2
-#ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+#ifdef HAS_MERGEUVROW_AVX2
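+// Interleaves U and V: vpunpcklbw/vpunpckhbw pair the bytes within each lane,
+// and the four vextractf128 stores write the lanes back out in linear order.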
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
"sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
- "lea " MEMLEA(0x20,2) ",%2 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
+ "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
+ "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
+ "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2"
);
}
+#endif // HAS_MERGEUVROW_AVX2
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width) {
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
asm volatile (
"sub %0,%1 \n"
LABELALIGN
@@ -3230,13 +2449,8 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
"+r"(dst_uv), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2"
);
}
#endif // HAS_MERGEUVROW_SSE2
@@ -3246,11 +2460,11 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
@@ -3259,30 +2473,36 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
"+r"(count) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1"
-#endif
);
}
#endif // HAS_COPYROW_SSE2
-#ifdef HAS_COPYROW_X86
-void CopyRow_X86(const uint8* src, uint8* dst, int width) {
- size_t width_tmp = (size_t)(width);
+#ifdef HAS_COPYROW_AVX
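+// Copies 64 bytes per iteration using two unaligned 32-byte ymm loads/stores.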
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
asm volatile (
- "shr $0x2,%2 \n"
- "rep movsl " MEMMOVESTRING(0,1) " \n"
- : "+S"(src), // %0
- "+D"(dst), // %1
- "+c"(width_tmp) // %2
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
:
: "memory", "cc"
+ , "xmm0", "xmm1"
);
}
-#endif // HAS_COPYROW_X86
+#endif // HAS_COPYROW_AVX
#ifdef HAS_COPYROW_ERMS
-// Unaligned Multiple of 1.
+// Multiple of 1.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
size_t width_tmp = (size_t)(width);
asm volatile (
@@ -3306,19 +2526,19 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa " MEMACCESS(1) ",%%xmm4 \n"
- "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
"pand %%xmm0,%%xmm2 \n"
"pand %%xmm0,%%xmm3 \n"
"pand %%xmm1,%%xmm4 \n"
"pand %%xmm1,%%xmm5 \n"
"por %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm3 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -3327,9 +2547,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"+r"(width) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_ARGBCOPYALPHAROW_SSE2
@@ -3358,9 +2576,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
"+r"(width) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
-#endif
);
}
#endif // HAS_ARGBCOPYALPHAROW_AVX2
@@ -3380,16 +2596,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"punpcklbw %%xmm2,%%xmm2 \n"
"punpckhwd %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm2,%%xmm2 \n"
- "movdqa " MEMACCESS(1) ",%%xmm4 \n"
- "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
"pand %%xmm0,%%xmm2 \n"
"pand %%xmm0,%%xmm3 \n"
"pand %%xmm1,%%xmm4 \n"
"pand %%xmm1,%%xmm5 \n"
"por %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm3 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -3398,9 +2614,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"+r"(width) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
@@ -3431,18 +2645,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
"+r"(width) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
-#endif
);
}
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
-void SetRow_X86(uint8* dst, uint32 v32, int width) {
- size_t width_tmp = (size_t)(width);
+void SetRow_X86(uint8* dst, uint8 v8, int width) {
+ size_t width_tmp = (size_t)(width >> 2);
+ const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
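+ // e.g. v8 = 0x5a gives v32 = 0x5a5a5a5a, so each rep stosl stores 4 copies.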
asm volatile (
- "shr $0x2,%1 \n"
"rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
@@ -3450,19 +2662,24 @@ void SetRow_X86(uint8* dst, uint32 v32, int width) {
: "memory", "cc");
}
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- size_t width_tmp = (size_t)(width);
- uint32* d = (uint32*)(dst);
- asm volatile (
- "rep stosl " MEMSTORESTRING(eax,0) " \n"
- : "+D"(d), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
- dst += dst_stride;
- }
+void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile (
+ "rep stosb " MEMSTORESTRING(al,0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v8) // %2
+ : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile (
+ "rep stosl " MEMSTORESTRING(eax,0) " \n"
+ : "+D"(dst_argb), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
}
#endif // HAS_SETROW_X86
@@ -3473,13 +2690,13 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
@@ -3488,9 +2705,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
"+r"(pix) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
-#endif
);
}
@@ -3502,11 +2717,10 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
@@ -3519,7 +2733,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
@@ -3529,13 +2742,8 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"+r"(dst_v), // %2
"+r"(pix) // %3
: "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -3547,8 +2755,8 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
@@ -3559,7 +2767,6 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
@@ -3569,47 +2776,36 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
- : "+r"(src_yuy2), // %0
+ : "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ , "xmm0", "xmm1"
);
}
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3618,14 +2814,13 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
@@ -3633,28 +2828,22 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
- : "+r"(src_yuy2), // %0
+ : "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3664,8 +2853,8 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
@@ -3673,247 +2862,226 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
- : "+r"(src_yuy2), // %0
+ : "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
+#endif // HAS_YUY2TOYROW_SSE2
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
- : "+r"(src_uyvy), // %0
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ , "xmm0", "xmm1", "xmm5"
);
}
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
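+ // The two vpavgb above average this row with the row one stride below,
+ // i.e. the vertical 2:1 chroma subsample for 4:2:0 output.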
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
- : "+r"(src_uyvy), // %0
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
- : "+r"(src_uyvy), // %0
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ , "xmm0", "xmm1", "xmm5"
);
}
-
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
: "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
-#endif // HAS_YUY2TOYROW_SSE2
+#endif // HAS_YUY2TOYROW_AVX2
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 4 pixels at a time, with 1 pixel loops for the row edges.
@@ -3956,9 +3124,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
@@ -3988,9 +3156,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jge 41b \n"
"49: \n"
@@ -4019,9 +3187,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
"jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
@@ -4030,9 +3198,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"+r"(width) // %3
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
);
}
#endif // HAS_ARGBBLENDROW_SSE2
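// For reference, a plain-C sketch of the "over" blend these kernels compute
// (hypothetical name, in the spirit of libyuv's row_common.cc fallbacks;
// uint8/uint32 are libyuv's basic_types.h typedefs). src_argb0 is assumed to
// carry premultiplied color: dst = sat(src + bg * (256 - src_alpha) >> 8),
// with the result alpha forced opaque, matching the por/paddusb sequence.
static void ARGBBlendRow_C_Sketch(const uint8* src_argb0,
                                  const uint8* src_argb1,
                                  uint8* dst_argb, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb0[3];     // foreground alpha
    uint32 scale = 256 - a;      // background weight
    for (j = 0; j < 3; ++j) {    // B, G, R
      uint32 v = src_argb0[j] + ((src_argb1[j] * scale) >> 8);
      dst_argb[j] = (uint8)(v > 255 ? 255 : v);  // saturate like paddusb
    }
    dst_argb[3] = 255;           // kernels force alpha opaque
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}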
@@ -4091,49 +3257,18 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
"add $1-4,%3 \n"
"jl 49f \n"
- "test $0xf,%0 \n"
- "jne 41f \n"
- "test $0xf,%1 \n"
- "jne 41f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqa " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqa " MEMACCESS(1) ",%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqa " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jge 40b \n"
- "jmp 49f \n"
-
- // 4 pixel unaligned loop.
- LABELALIGN
- "41: \n"
"movdqu " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
@@ -4152,10 +3287,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
- "jge 41b \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
"add $0x3,%3 \n"
@@ -4181,9 +3316,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
"jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
@@ -4192,16 +3327,13 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"+r"(width) // %3
: "m"(kShuffleAlpha) // %4
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
);
}
#endif // HAS_ARGBBLENDROW_SSSE3
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
-// aligned to 16 bytes
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
@@ -4212,17 +3344,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pshufhw $0xff,%%xmm0,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"pshufhw $0xff,%%xmm1,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"pand %%xmm4,%%xmm2 \n"
@@ -4230,18 +3362,16 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"packuswb %%xmm1,%%xmm0 \n"
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_ARGBATTENUATEROW_SSE2
@@ -4249,14 +3379,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
static uvec8 kShuffleAlpha0 = {
- 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
};
static uvec8 kShuffleAlpha1 = {
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
};
// Attenuate 4 pixels at a time.
-// aligned to 16 bytes
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm3,%%xmm3 \n"
@@ -4284,9 +3413,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4294,16 +3423,56 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
: "m"(kShuffleAlpha0), // %3
"m"(kShuffleAlpha1) // %4
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_ARGBATTENUATEROW_SSSE3
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha_AVX2) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
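// For reference, a plain-C sketch of the attenuate kernels (hypothetical
// name, in the spirit of libyuv's row_common.cc fallbacks; uint8/uint32 are
// libyuv's basic_types.h typedefs). Duplicating a byte into both halves of a
// 16-bit lane multiplies it by 0x101, so the SIMD math above is
// ((v * 0x101) * (a * 0x101)) >> 24, a close approximation of v * a / 255.
static void ARGBAttenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb[3];
    dst_argb[0] = (uint8)(((src_argb[0] * 0x101) * (a * 0x101)) >> 24);  // B
    dst_argb[1] = (uint8)(((src_argb[1] * 0x101) * (a * 0x101)) >> 24);  // G
    dst_argb[2] = (uint8)(((src_argb[2] * 0x101) * (a * 0x101)) >> 24);  // R
    dst_argb[3] = (uint8)a;  // alpha is carried through unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}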
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-// aligned to 16 bytes
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
uintptr_t alpha = 0;
@@ -4324,7 +3493,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"movzb " MEMACCESS2(0x0b,0) ",%3 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
"movzb " MEMACCESS2(0x0f,0) ",%3 \n"
MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
@@ -4334,26 +3502,90 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pmulhuw %%xmm2,%%xmm1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
"+r"(alpha) // %3
: "r"(fixed_invtbl8) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_ARGBUNATTENUATEROW_SSE2
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// Unattenuate 8 pixels at a time.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ uintptr_t alpha = 0;
+ asm volatile (
+ "sub %0,%1 \n"
+ "vbroadcastf128 %5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // replace VPGATHER
+ "movzb " MEMACCESS2(0x03,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
+ "movzb " MEMACCESS2(0x07,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
+ "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
+ "movzb " MEMACCESS2(0x13,0) ",%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
+ "movzb " MEMACCESS2(0x17,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
+ "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
+ "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
+ "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
+ // end of VPGATHER
+
+ "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(alpha) // %3
+ : "r"(fixed_invtbl8), // %4
+ "m"(kUnattenShuffleAlpha_AVX2) // %5
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
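// The inverse, again as a plain-C sketch (hypothetical name): undo the
// premultiplication with dst = min(255, v * 255 / a). The SIMD kernels get
// the same effect without a per-pixel divide by looking up a fixed-point
// reciprocal of the alpha in the fixed_invtbl8 table referenced above.
static void ARGBUnattenuateRow_C_Sketch(const uint8* src_argb,
                                        uint8* dst_argb, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb[3];
    for (j = 0; j < 3; ++j) {  // B, G, R
      uint32 v = a ? (src_argb[j] * 255 / a) : src_argb[j];
      dst_argb[j] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}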
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -4364,16 +3596,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm0 \n"
"paddw %%xmm5,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x18,%%xmm2 \n"
"psrld $0x18,%%xmm3 \n"
@@ -4385,10 +3617,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm3,%%xmm0 \n"
"punpckhwd %%xmm3,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4396,9 +3628,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
: "m"(kARGBToYJ), // %3
"m"(kAddYJ64) // %4
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_ARGBGRAYROW_SSSE3
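// A plain-C sketch of the gray kernel (hypothetical name). The kARGBToYJ and
// kAddYJ64 tables it maddubs/rounds with live earlier in this file; assuming
// the usual full-range luma weights B=15, G=75, R=38 (which sum to 128), the
// per-pixel operation is:
static void ARGBGrayRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                 int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint8 y = (uint8)((src_argb[0] * 15 + src_argb[1] * 75 +
                       src_argb[2] * 38 + 64) >> 7);
    dst_argb[0] = dst_argb[1] = dst_argb[2] = y;  // replicate luma into B,G,R
    dst_argb[3] = src_argb[3];                    // alpha passes through
    src_argb += 4;
    dst_argb += 4;
  }
}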
@@ -4430,30 +3660,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm6 \n"
"phaddw %%xmm6,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm5 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm5 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm5 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm5 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"psrld $0x18,%%xmm6 \n"
"psrld $0x18,%%xmm1 \n"
"packuswb %%xmm1,%%xmm6 \n"
@@ -4462,10 +3692,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm5,%%xmm0 \n"
"punpckhwd %%xmm5,%%xmm1 \n"
- "sub $0x8,%1 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%1 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@@ -4473,9 +3703,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"m"(kARGBToSepiaG), // %3
"m"(kARGBToSepiaR) // %4
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
);
}
#endif // HAS_ARGBSEPIAROW_SSSE3
@@ -4495,12 +3723,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddsw %%xmm7,%%xmm0 \n"
@@ -4510,13 +3738,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"packuswb %%xmm0,%%xmm0 \n"
"packuswb %%xmm6,%%xmm6 \n"
"punpcklbw %%xmm6,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm7 \n"
"phaddsw %%xmm7,%%xmm1 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm5,%%xmm6 \n"
"pmaddubsw %%xmm5,%%xmm7 \n"
"phaddsw %%xmm7,%%xmm6 \n"
@@ -4528,27 +3756,24 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"movdqa %%xmm0,%%xmm6 \n"
"punpcklwd %%xmm1,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm6 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(matrix_argb) // %3
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
);
}
#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-// aligned to 16 bytes
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
asm volatile (
@@ -4568,23 +3793,23 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm1 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"pmullw %%xmm3,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm7 \n"
"pmullw %%xmm3,%%xmm1 \n"
"pand %%xmm6,%%xmm7 \n"
"paddw %%xmm4,%%xmm0 \n"
"paddw %%xmm4,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm7,%%xmm0 \n"
- "sub $0x4,%1 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x4,%1 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@@ -4592,16 +3817,13 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
"r"(interval_size), // %3
"r"(interval_offset) // %4
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
);
}
#endif // HAS_ARGBQUANTIZEROW_SSE2
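// A plain-C sketch of the quantize kernel (hypothetical name): posterize each
// color channel in place into evenly spaced bands, leaving alpha untouched.
// scale is a caller-supplied 16.16 factor, typically about
// 65536 / interval_size, which is what the pmulhuw above consumes.
static void ARGBQuantizeRow_C_Sketch(uint8* dst_argb, int scale,
                                     int interval_size, int interval_offset,
                                     int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 3; ++j) {  // B, G, R; dst_argb[3] (alpha) is preserved
      dst_argb[j] = (uint8)((dst_argb[j] * scale >> 16) * interval_size +
                            interval_offset);
    }
    dst_argb += 4;
  }
}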
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
asm volatile (
@@ -4612,7 +3834,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
@@ -4622,18 +3844,16 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(value) // %3
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
-#endif
);
}
#endif // HAS_ARGBSHADEROW_SSE2
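// A plain-C sketch of the shade kernel (hypothetical name): each channel is
// scaled by the matching channel of the packed ARGB 'value'. As in the
// attenuate kernel, byte duplication is a multiply by 0x101, so the stored
// result is ((v * 0x101) * (s * 0x101)) >> 24, roughly v * s / 255.
static void ARGBShadeRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  int width, uint32 value) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {  // B, G, R, A
      uint32 s = (value >> (j * 8)) & 0xff;
      dst_argb[j] = (uint8)(((src_argb[j] * 0x101) * (s * 0x101)) >> 24);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}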
@@ -4643,7 +3863,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
@@ -4661,9 +3881,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4671,13 +3891,50 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"+r"(width) // %3
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
);
}
#endif // HAS_ARGBMULTIPLYROW_SSE2
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+  // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
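// A plain-C sketch of what the multiply kernels compute (hypothetical name).
// The unpack-with-self above turns each src0 byte into a 16-bit v * 0x101,
// so the stored result is (v * 0x101 * w) >> 16, a close approximation of
// v * w / 255 (255 * 255 maps to 254).
static void ARGBMultiplyRow_C_Sketch(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {  // ARGB is 4 bytes per pixel
    dst_argb[i] = (uint8)((src_argb0[i] * 0x101 * src_argb1[i]) >> 16);
  }
}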
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
@@ -4691,9 +3948,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4701,13 +3958,39 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"+r"(width) // %3
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1"
-#endif
);
}
#endif // HAS_ARGBADDROW_SSE2
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+  // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBADDROW_AVX2
+
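// A plain-C sketch of the saturating add done by paddusb/vpaddusb above
// (hypothetical name):
static void ARGBAddRow_C_Sketch(const uint8* src_argb0,
                                const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int sum = src_argb0[i] + src_argb1[i];
    dst_argb[i] = (uint8)(sum > 255 ? 255 : sum);  // clamp high, like paddusb
  }
}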
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
@@ -4721,9 +4004,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"psubusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4731,13 +4014,39 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"+r"(width) // %3
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1"
-#endif
);
}
#endif // HAS_ARGBSUBTRACTROW_SSE2
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+  // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
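// And the matching saturating subtract (psubusb/vpsubusb), as a plain-C
// sketch with a hypothetical name:
static void ARGBSubtractRow_C_Sketch(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int diff = src_argb0[i] - src_argb1[i];
    dst_argb[i] = (uint8)(diff < 0 ? 0 : diff);  // clamp low, like psubusb
  }
}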
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
@@ -4759,13 +4068,11 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"punpcklbw %%xmm5,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm1 \n"
"psubw %%xmm1,%%xmm0 \n"
- BUNDLEALIGN
MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
"punpcklbw %%xmm5,%%xmm1 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"psubw %%xmm2,%%xmm1 \n"
- BUNDLEALIGN
MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
"punpcklbw %%xmm5,%%xmm2 \n"
@@ -4778,10 +4085,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"psubw %%xmm0,%%xmm1 \n"
"pmaxsw %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "sub $0x8,%4 \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
"lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -4789,13 +4095,8 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"+r"(dst_sobelx), // %3
"+r"(width) // %4
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SOBELXROW_SSE2
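// A plain-C sketch of the SobelX kernel (hypothetical name): the 3x3
// horizontal-gradient matrix (-1 0 1 / -2 0 2 / -1 0 1) applied across the
// three input rows, stored as a saturated absolute value. Like the SIMD
// version it reads two bytes past 'width', which callers allow for.
static void SobelXRow_C_Sketch(const uint8* src_y0, const uint8* src_y1,
                               const uint8* src_y2, uint8* dst_sobelx,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b + b + c;       // column weights 1, 2, 1
    if (sobel < 0) sobel = -sobel;   // pmaxsw(x, -x) is abs()
    dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}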
@@ -4820,13 +4121,11 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"punpcklbw %%xmm5,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm1 \n"
"psubw %%xmm1,%%xmm0 \n"
- BUNDLEALIGN
"movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
"punpcklbw %%xmm5,%%xmm1 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"psubw %%xmm2,%%xmm1 \n"
- BUNDLEALIGN
"movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
"punpcklbw %%xmm5,%%xmm2 \n"
@@ -4839,23 +4138,17 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"psubw %%xmm0,%%xmm1 \n"
"pmaxsw %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "sub $0x8,%3 \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
"lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%3 \n"
"jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SOBELYROW_SSE2
@@ -4876,8 +4169,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
@@ -4893,25 +4186,20 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"punpckhwd %%xmm0,%%xmm0 \n"
"por %%xmm5,%%xmm3 \n"
"por %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm1," MEMACCESS(2) " \n"
- "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
- "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
+ "movdqu %%xmm1," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SOBELROW_SSE2
@@ -4928,26 +4216,21 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
);
}
#endif // HAS_SOBELTOPLANEROW_SSE2
@@ -4967,8 +4250,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"paddusb %%xmm1,%%xmm2 \n"
@@ -4984,25 +4267,20 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"movdqa %%xmm1,%%xmm7 \n"
"punpcklwd %%xmm0,%%xmm7 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm6," MEMACCESS(2) " \n"
- "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
- "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
+ "movdqu %%xmm6," MEMACCESS(2) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_SOBELXYROW_SSE2
@@ -5035,22 +4313,22 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"punpcklwd %%xmm1,%%xmm4 \n"
"punpckhwd %%xmm1,%%xmm5 \n"
"paddd %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(2) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
"paddd %%xmm0,%%xmm2 \n"
"paddd %%xmm3,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
+ "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
"paddd %%xmm0,%%xmm3 \n"
"paddd %%xmm4,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
"paddd %%xmm0,%%xmm4 \n"
"paddd %%xmm5,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"paddd %%xmm0,%%xmm5 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
- "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
- "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x4,%3 \n"
"jge 40b \n"
@@ -5082,9 +4360,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"+r"(width) // %3
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
@@ -5115,11 +4391,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
  // 4 pixel small loop
LABELALIGN
"4: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- BUNDLEALIGN
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
@@ -5129,7 +4404,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
"psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
"psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
"psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
- BUNDLEALIGN
MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
@@ -5149,11 +4423,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
  // 4 pixel loop
LABELALIGN
"40: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- BUNDLEALIGN
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
@@ -5163,7 +4436,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
"psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
"psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
"psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
- BUNDLEALIGN
MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
@@ -5196,11 +4468,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
  // 1 pixel loop
LABELALIGN
"10: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
"lea " MEMLEA(0x10,0) ",%0 \n"
"psubd " MEMACCESS(1) ",%%xmm0 \n"
- BUNDLEALIGN
MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
"lea " MEMLEA(0x10,1) ",%1 \n"
"cvtdq2ps %%xmm0,%%xmm0 \n"
@@ -5219,13 +4490,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
"+rm"(count) // %3
: "r"((intptr_t)(width)), // %4
"rm"(area) // %5
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@@ -5268,7 +4534,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"pshufd $0x39,%%xmm0,%%xmm0 \n"
"movd %%xmm0,%k5 \n"
"pshufd $0x39,%%xmm0,%%xmm0 \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
"punpckldq %%xmm6,%%xmm1 \n"
@@ -5277,14 +4542,13 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"movd %%xmm0,%k1 \n"
"pshufd $0x39,%%xmm0,%%xmm0 \n"
"movd %%xmm0,%k5 \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
"punpckldq %%xmm6,%%xmm0 \n"
"addps %%xmm4,%%xmm3 \n"
- "sub $0x4,%4 \n"
"movq %%xmm0," MEMACCESS2(0x08,2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
"jge 40b \n"
"49: \n"
@@ -5299,11 +4563,10 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"pmaddwd %%xmm5,%%xmm0 \n"
"addps %%xmm7,%%xmm2 \n"
"movd %%xmm0,%k1 \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
- "sub $0x1,%4 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x04,2) ",%2 \n"
+ "sub $0x1,%4 \n"
"jge 10b \n"
"19: \n"
: "+r"(src_argb), // %0
@@ -5313,13 +4576,8 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"+rm"(width), // %4
"+r"(temp) // %5
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_ARGBAFFINEROW_SSE2
@@ -5352,8 +4610,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
// General purpose row blend.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm2)
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2)
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
@@ -5362,61 +4620,57 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
LABELALIGN
"25: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1)
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1)
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
LABELALIGN
"75: \n"
- "movdqa " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm0)
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm0)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 100b \n"
"99: \n"
@@ -5425,147 +4679,22 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"+r"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm5"
);
}
#endif // HAS_INTERPOLATEROW_SSSE3
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
asm volatile (
- "sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
"je 100f \n"
- "cmp $0x20,%3 \n"
- "je 75f \n"
- "cmp $0x40,%3 \n"
- "je 50f \n"
- "cmp $0x60,%3 \n"
- "je 25f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm2 \n"
- "punpckhbw %%xmm4,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "punpckhbw %%xmm4,%%xmm1 \n"
- "psubw %%xmm0,%%xmm2 \n"
- "psubw %%xmm1,%%xmm3 \n"
- "paddw %%xmm2,%%xmm2 \n"
- "paddw %%xmm3,%%xmm3 \n"
- "pmulhw %%xmm5,%%xmm2 \n"
- "pmulhw %%xmm5,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 25 / 75.
- LABELALIGN
- "25: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 25b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 75 / 25.
- LABELALIGN
- "75: \n"
- "movdqa " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 75b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_INTERPOLATEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_SSSE3
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
"sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n"
@@ -5573,106 +4702,95 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"cmp $0x60,%3 \n"
"je 25f \n"
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermd %%ymm5,%%ymm0,%%ymm5 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm2)
- "movdqu %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm1 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
LABELALIGN
"25: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
LABELALIGN
"75: \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm0)
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 100b \n"
+ "rep movsb " MEMMOVESTRING(1,0) " \n"
+ "jmp 999f \n"
"99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
+ "vzeroupper \n"
+ "999: \n"
+ : "+D"(dst_ptr), // %0
+ "+S"(src_ptr), // %1
+ "+c"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm5"
);
}
-#endif // HAS_INTERPOLATEROW_SSSE3
+#endif // HAS_INTERPOLATEROW_AVX2
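// A plain-C sketch of the row interpolator (hypothetical name): blend a row
// with the one src_stride below it, weighted by source_y_fraction (0..256).
// The SIMD kernels halve the fraction into 1.7 fixed point and special-case
// the 0%, 25%, 50%, 75% and 100% blends; the general path is:
static void InterpolateRow_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int width,
                                    int source_y_fraction) {
  int y1_fraction = source_y_fraction >> 1;  // 0..128
  int y0_fraction = 128 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  int i;
  for (i = 0; i < width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * y0_fraction +
                          src_ptr1[i] * y1_fraction) >> 7);
  }
}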
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
asm volatile (
"sub %1,%0 \n"
"shr %3 \n"
@@ -5699,8 +4817,8 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"1: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
@@ -5714,10 +4832,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
"jmp 99f \n"
@@ -5728,10 +4845,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 25b \n"
"jmp 99f \n"
@@ -5741,10 +4857,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 50b \n"
"jmp 99f \n"
@@ -5755,10 +4870,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 75b \n"
"jmp 99f \n"
@@ -5766,9 +4880,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
LABELALIGN
"100: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 100b \n"
"99: \n"
@@ -5777,73 +4891,12 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"+r"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_INTERPOLATEROW_SSE2
-#ifdef HAS_HALFROW_SSE2
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- asm volatile (
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1)
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_uv), // %1
- "+r"(pix) // %2
- : "r"((intptr_t)(src_uv_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0"
-#endif
- );
-}
-#endif // HAS_HALFROW_SSE2
-
-#ifdef HAS_ARGBTOBAYERROW_SSSE3
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- // NaCL caveat - assumes movd is from GPR
- "movd %3,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- : "g"(selector) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBTOBAYERROW_SSSE3
-
#ifdef HAS_ARGBTOBAYERGGROW_SSE2
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
@@ -5852,8 +4905,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
"psrld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x8,%%xmm0 \n"
"psrld $0x8,%%xmm1 \n"
@@ -5861,18 +4914,16 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
"pand %%xmm5,%%xmm1 \n"
"packssdw %%xmm1,%%xmm0 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x8,%2 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
-#endif
);
}
#endif // HAS_ARGBTOBAYERGGROW_SSE2
@@ -5882,34 +4933,7 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
- "movdqa " MEMACCESS(3) ",%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(shuffler) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- asm volatile (
- "movdqa " MEMACCESS(3) ",%%xmm5 \n"
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
@@ -5917,19 +4941,17 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
- "sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(shuffler) // %3
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
-#endif
);
}
#endif // HAS_ARGBSHUFFLEROW_SSSE3
@@ -5947,19 +4969,18 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "sub $0x10,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(shuffler) // %3
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
-#endif
);
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
@@ -5989,7 +5010,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"movzb " MEMACCESS2(0x1,4) ",%2 \n"
MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
"mov %b2," MEMACCESS2(0x1,1) " \n"
- BUNDLEALIGN
"movzb " MEMACCESS2(0x2,4) ",%2 \n"
MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
"mov %b2," MEMACCESS2(0x2,1) " \n"
@@ -6014,9 +5034,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x1b,%%xmm1,%%xmm1 \n"
"pshuflw $0x1b,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
"jg 123b \n"
"jmp 99f \n"
@@ -6032,9 +5052,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x39,%%xmm1,%%xmm1 \n"
"pshuflw $0x39,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
"jg 321b \n"
"jmp 99f \n"
@@ -6050,9 +5070,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x93,%%xmm1,%%xmm1 \n"
"pshuflw $0x93,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
"jg 2103b \n"
"jmp 99f \n"
@@ -6068,9 +5088,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0xc6,%%xmm1,%%xmm1 \n"
"pshuflw $0xc6,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
"jg 3012b \n"
"99: \n"
@@ -6079,13 +5099,8 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"+d"(pixel_temp), // %2
"+r"(pix) // %3
: "r"(shuffler) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
#endif // HAS_ARGBSHUFFLEROW_SSE2
@@ -6119,13 +5134,8 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
"+r"(dst_frame), // %3
"+rm"(width) // %4
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
);
}
#endif // HAS_I422TOYUY2ROW_SSE2
@@ -6159,13 +5169,8 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
"+r"(dst_frame), // %3
"+rm"(width) // %4
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
);
}
#endif // HAS_I422TOUYVYROW_SSE2
@@ -6212,18 +5217,16 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
"cvttps2dq %%xmm4,%%xmm4 \n"
"packuswb %%xmm4,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "sub $0x2,%2 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(poly) // %3
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
);
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
@@ -6253,20 +5256,17 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
"vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
- "sub $0x2,%2 \n"
"vmovq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(poly) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
-// TODO(fbarchard): declare ymm usage when applicable.
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
@@ -6376,7 +5376,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"movzb " MEMACCESS2(0x4,2) ",%0 \n"
MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
"mov %b0," MEMACCESS2(0x4,3) " \n"
- BUNDLEALIGN
"movzb " MEMACCESS2(0x5,2) ",%0 \n"
MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
"mov %b0," MEMACCESS2(0x5,3) " \n"
@@ -6416,9 +5415,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"mov %b0," MEMACCESS2(0xe,3) " \n"
"movzb " MEMACCESS2(0xf,2) ",%0 \n"
"mov %b0," MEMACCESS2(0xf,3) " \n"
- "sub $0x4,%4 \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"lea " MEMLEA(0x10,3) ",%3 \n"
+ "sub $0x4,%4 \n"
"jg 1b \n"
: "+d"(pixel_temp), // %0
"+a"(table_temp), // %1
@@ -6427,10 +5426,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"+rm"(width) // %4
: "r"(luma), // %5
"rm"(lumacoeff) // %6
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_win.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_win.cc
index d79c353960b..6e9d04c0e4e 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_win.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_win.cc
@@ -24,55 +24,63 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
(defined(_M_IX86) || defined(_M_X64))
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(127,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-static const vec8 kUVToB = {
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-
-static const vec8 kUVToR = {
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-
-static const vec8 kUVToG = {
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+struct YuvConstants {
+ lvec8 kUVToB; // 0
+ lvec8 kUVToG; // 32
+ lvec8 kUVToR; // 64
+ lvec16 kUVBiasB; // 96
+ lvec16 kUVBiasG; // 128
+ lvec16 kUVBiasR; // 160
+ lvec16 kYToRgb; // 192
};
-static const vec8 kVUToB = {
- VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
-static const vec8 kVUToR = {
- VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
-static const vec8 kVUToG = {
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
-
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-
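For orientation, a minimal scalar sketch of the fixed-point math these new constants implement; the helper names are illustrative and not part of the patch, and it assumes the YG/YGB/UB/UG/VG/VR and BB/BG/BR macros defined above.

    static unsigned char Clamp255(int v) {
      return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    // One pixel of the new path: the UV products are subtracted *from* the
    // bias (matching the sub(bias, product) order in the SIMD code below),
    // and Y is scaled by duplicating its byte (y * 257) and keeping the high
    // 16 bits of the YG multiply, which folds the old "subtract 16" into YGB.
    static void YuvPixel(int y, int u, int v,
                         unsigned char* b, unsigned char* g, unsigned char* r) {
      int y1 = (int)(((unsigned)(y * 0x0101) * YG) >> 16);
      *b = Clamp255((BB - (u * UB) + y1) >> 6);
      *g = Clamp255((BG - (u * UG + v * VG) + y1) >> 6);
      *r = Clamp255((BR - (v * VR) + y1) >> 6);
    }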
// 64 bit
#if defined(_M_X64)
-// Aligned destination version.
__declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -81,7 +89,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
int width) {
__m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1);
- const __m128i xmm4 = _mm_setzero_si128();
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
while (width > 0) {
@@ -89,18 +96,17 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
- xmm1 = _mm_load_si128(&xmm0);
- xmm2 = _mm_load_si128(&xmm0);
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
- xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
- xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
- xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+ xmm1 = _mm_loadu_si128(&xmm0);
+ xmm2 = _mm_loadu_si128(&xmm0);
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
+ xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
+ xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
+ xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
- xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
- xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
+ xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
xmm0 = _mm_adds_epi16(xmm0, xmm3);
xmm1 = _mm_adds_epi16(xmm1, xmm3);
xmm2 = _mm_adds_epi16(xmm2, xmm3);
@@ -112,60 +118,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
xmm2 = _mm_packus_epi16(xmm2, xmm2);
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
- xmm1 = _mm_load_si128(&xmm0);
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
- xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
-
- _mm_store_si128((__m128i *)dst_argb, xmm0);
- _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
-
- y_buf += 8;
- u_buf += 4;
- dst_argb += 32;
- width -= 8;
- }
-}
-
-// Unaligned destination version.
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm3;
- const __m128i xmm5 = _mm_set1_epi8(-1);
- const __m128i xmm4 = _mm_setzero_si128();
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
-
- while (width > 0) {
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
- xmm1 = _mm_load_si128(&xmm0);
- xmm2 = _mm_load_si128(&xmm0);
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
- xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
- xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
- xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
- xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
- xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
- xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
- xmm0 = _mm_adds_epi16(xmm0, xmm3);
- xmm1 = _mm_adds_epi16(xmm1, xmm3);
- xmm2 = _mm_adds_epi16(xmm2, xmm3);
- xmm0 = _mm_srai_epi16(xmm0, 6);
- xmm1 = _mm_srai_epi16(xmm1, 6);
- xmm2 = _mm_srai_epi16(xmm2, 6);
- xmm0 = _mm_packus_epi16(xmm0, xmm0);
- xmm1 = _mm_packus_epi16(xmm1, xmm1);
- xmm2 = _mm_packus_epi16(xmm2, xmm2);
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
- xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
- xmm1 = _mm_load_si128(&xmm0);
+ xmm1 = _mm_loadu_si128(&xmm0);
xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
@@ -178,6 +131,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
width -= 8;
}
}
+
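The switch from subtract-16 plus mullo to unpack-with-self plus mulhi is worth a note: duplicating the Y byte yields y * 257, so one unsigned mulhi by YG scales Y while the 16-offset moves into the YGB bias. A hypothetical sanity check, not part of the patch:

    #include <assert.h>

    static void CheckYScale(void) {
      for (int y = 0; y < 256; ++y) {
        unsigned yy = (unsigned)((y << 8) | y);  // _mm_unpacklo_epi8(y, y), one lane
        unsigned y1 = (yy * 18997u) >> 16;       // _mm_mulhi_epu16(yy, kYToRgb)
        int ref = (149 * y) / 2;                 // 74.5 * y, i.e. 1.164 * 64 * y
        assert(yy == y * 257u);
        assert(ref - (int)y1 >= 0 && ref - (int)y1 <= 1);
      }
    }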
// 32 bit
#else // defined(_M_X64)
@@ -209,15 +163,10 @@ static const vec8 kARGBToVJ = {
-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};
-// vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
- 0, 4, 1, 5, 2, 6, 3, 7
-};
-
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
// Constants for BGRA.
@@ -263,6 +212,7 @@ static const uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
+// 0.5 in 7-bit fixed point, added for rounding.
static const vec16 kAddYJ64 = {
64, 64, 64, 64, 64, 64, 64, 64
};
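The +64 matters for the YJ (full-range) path below: it is 0.5 in the 7-bit fixed-point sum, so the final shift rounds to nearest instead of truncating. A scalar sketch, with the 15/75/38 weights assumed from libyuv's kARGBToYJ table rather than quoted from this hunk:

    // yj = round((15*B + 75*G + 38*R) / 128); 15 + 75 + 38 == 128, so no bias.
    static unsigned char RGBToYJ(unsigned char r, unsigned char g, unsigned char b) {
      return (unsigned char)((15 * b + 75 * g + 38 * r + 64) >> 7);
    }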
@@ -316,36 +266,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
- align 4
- convertloop:
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0
- punpckhwd xmm1, xmm1
- por xmm0, xmm5
- por xmm1, xmm5
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
- int pix) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
-
- align 4
convertloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
@@ -374,7 +294,6 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
pslld xmm5, 24
movdqa xmm4, kShuffleMaskRGB24ToARGB
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -386,18 +305,18 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
- movdqa [edx + 32], xmm2
+ movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
- movdqa [edx + 16], xmm1
+ movdqu [edx + 16], xmm1
por xmm3, xmm5
- sub ecx, 16
- movdqa [edx + 48], xmm3
+ movdqu [edx + 48], xmm3
lea edx, [edx + 64]
+ sub ecx, 16
jg convertloop
ret
}
@@ -414,7 +333,6 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
pslld xmm5, 24
movdqa xmm4, kShuffleMaskRAWToARGB
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -426,18 +344,18 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
- movdqa [edx + 32], xmm2
+ movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
- movdqa [edx + 16], xmm1
+ movdqu [edx + 16], xmm1
por xmm3, xmm5
- sub ecx, 16
- movdqa [edx + 48], xmm3
+ movdqu [edx + 48], xmm3
lea edx, [edx + 64]
+ sub ecx, 16
jg convertloop
ret
}
@@ -474,7 +392,6 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
sub edx, eax
sub edx, eax
- align 4
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgr565
movdqa xmm1, xmm0
@@ -491,8 +408,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
- movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -524,7 +441,6 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
sub edx, eax
sub edx, eax
- align 4
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of 1555
movdqa xmm1, xmm0
@@ -545,8 +461,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
- movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -570,7 +486,6 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
sub edx, eax
sub edx, eax
- align 4
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
movdqa xmm2, xmm0
@@ -585,8 +500,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -602,7 +517,6 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRGB24
- align 4
convertloop:
movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16]
@@ -641,7 +555,6 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRAW
- align 4
convertloop:
movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16]
@@ -686,9 +599,8 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
pslld xmm5, 11
- align 4
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
pslld xmm0, 8 // R
@@ -726,9 +638,8 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
pslld xmm7, 15
- align 4
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
movdqa xmm3, xmm0 // R
@@ -764,14 +675,13 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
movdqa xmm3, xmm4 // generate mask 0x00f000f0
psrlw xmm3, 8
- align 4
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0
pand xmm0, xmm3 // low nibble
pand xmm1, xmm4 // high nibble
- psrl xmm0, 4
- psrl xmm1, 8
+ psrld xmm0, 4
+ psrld xmm1, 8
por xmm0, xmm1
packuswb xmm0, xmm0
lea eax, [eax + 16]
@@ -783,6 +693,116 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
+#ifdef HAS_ARGBTORGB565ROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800
+ vpslld ymm5, ymm5, 11
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpslld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpsrad ymm0, ymm0, 16 // R
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackssdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565ROW_AVX2
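The shift/mask sequence above reads more easily in scalar form; a per-pixel model (hypothetical helper, assuming libyuv's ARGB byte order B,G,R,A, i.e. 0xAARRGGBB as a little-endian word):

    // 5 bits of B in bits 0-4, 6 bits of G in bits 5-10, 5 bits of R in bits 11-15.
    static unsigned short PackRGB565(unsigned int argb) {
      unsigned int b = (argb >> 3) & 0x001f;  // top 5 bits of B
      unsigned int g = (argb >> 5) & 0x07e0;  // top 6 bits of G
      unsigned int r = (argb >> 8) & 0xf800;  // top 5 bits of R
      return (unsigned short)(r | g | b);
    }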
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm4, ymm4, ymm4
+ vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
+ vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
+ vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
+ vpslld ymm7, ymm7, 15
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm3, ymm0, 9 // R
+ vpsrld ymm2, ymm0, 6 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrad ymm0, ymm0, 16 // A
+ vpand ymm3, ymm3, ymm6 // R
+ vpand ymm2, ymm2, ymm5 // G
+ vpand ymm1, ymm1, ymm4 // B
+ vpand ymm0, ymm0, ymm7 // A
+ vpor ymm0, ymm0, ymm1 // BA
+ vpor ymm2, ymm2, ymm3 // GR
+ vpor ymm0, ymm0, ymm2 // BGRA
+ vpackssdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB1555ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB4444ROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
+ vpsllw ymm4, ymm4, 12
+ vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpand ymm1, ymm0, ymm4 // high nibble
+ vpand ymm0, ymm0, ymm3 // low nibble
+ vpsrld ymm1, ymm1, 8
+ vpsrld ymm0, ymm0, 4
+ vpor ymm0, ymm0, ymm1
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB4444ROW_AVX2
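Similarly for the nibble masks in the ARGB4444 routine; a scalar model (helper name illustrative):

    // Keep the high nibble of each channel byte, then pair them into two
    // bytes: low byte = G4|B4, high byte = A4|R4.
    static unsigned short PackARGB4444(unsigned int argb) {
      unsigned int lo = (argb & 0x00f000f0) >> 4;  // B and R high nibbles
      unsigned int hi = (argb & 0xf000f000) >> 8;  // G and A high nibbles
      unsigned int m = lo | hi;
      return (unsigned short)((m & 0x00ff) | ((m >> 8) & 0xff00));
    }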
+
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
@@ -790,15 +810,14 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY
+ movdqa xmm5, kAddY16
- align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
@@ -810,15 +829,16 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but with different coefficients; no +16 bias is added, and the sum is rounded.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
@@ -828,12 +848,11 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm4, kARGBToYJ
movdqa xmm5, kAddYJ64
- align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
@@ -846,15 +865,20 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd to restore pixel order after the vphaddw + vpackuswb lane mutation.
+static const lvec32 kPermdARGBToY_AVX = {
+ 0, 4, 1, 5, 2, 6, 3, 7
+};
+
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
@@ -864,9 +888,8 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov ecx, [esp + 12] /* pix */
vbroadcastf128 ymm4, kARGBToY
vbroadcastf128 ymm5, kAddY16
- vmovdqa ymm6, kPermdARGBToY_AVX
+ vmovdqu ymm6, kPermdARGBToY_AVX
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -883,10 +906,10 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2 // mutates.
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- vpaddb ymm0, ymm0, ymm5
- sub ecx, 32
+ vpaddb ymm0, ymm0, ymm5 // add 16 for Y
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
vzeroupper
ret
@@ -904,9 +927,8 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov ecx, [esp + 12] /* pix */
vbroadcastf128 ymm4, kARGBToYJ
vbroadcastf128 ymm5, kAddYJ64
- vmovdqa ymm6, kPermdARGBToY_AVX
+ vmovdqu ymm6, kPermdARGBToY_AVX
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -925,9 +947,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2 // mutates.
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
vzeroupper
@@ -937,118 +959,14 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
#endif // HAS_ARGBTOYJROW_AVX2
__declspec(naked) __declspec(align(16))
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kARGBToY
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm4, kARGBToYJ
- movdqa xmm5, kAddYJ64
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- paddw xmm0, xmm5
- paddw xmm2, xmm5
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
movdqa xmm4, kBGRAToY
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
- movdqa xmm4, kBGRAToY
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1065,9 +983,9 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
@@ -1079,44 +997,9 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
movdqa xmm4, kABGRToY
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
- movdqa xmm4, kABGRToY
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1133,9 +1016,9 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
@@ -1147,44 +1030,9 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
movdqa xmm4, kRGBAToY
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
- movdqa xmm4, kRGBAToY
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1201,9 +1049,9 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
@@ -1220,22 +1068,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
movdqa xmm5, kAddUV128
+ movdqa xmm6, kARGBToV
+ movdqa xmm7, kARGBToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1263,10 +1115,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
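Step 1 above is a rounded 2x2 box filter: pavgb against the next row, then shufps 0x88/0xdd plus pavgb across pixel pairs. A scalar model of steps 1-2, with the 112/74/38 and 18/94/112 coefficients assumed from the kARGBToU/kARGBToV tables rather than quoted from this hunk:

    static int Avg(int a, int b) { return (a + b + 1) >> 1; }  // pavgb semantics

    // p0/p1 point at two vertically adjacent pairs of ARGB pixels (B,G,R,A bytes).
    static void ARGBToUV2x2(const unsigned char* p0, const unsigned char* p1,
                            unsigned char* u, unsigned char* v) {
      int b = Avg(Avg(p0[0], p1[0]), Avg(p0[4], p1[4]));
      int g = Avg(Avg(p0[1], p1[1]), Avg(p0[5], p1[5]));
      int r = Avg(Avg(p0[2], p1[2]), Avg(p0[6], p1[6]));
      *u = (unsigned char)(((112 * b - 74 * g - 38 * r) >> 8) + 128);  // psraw 8, +128
      *v = (unsigned char)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
    }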
@@ -1286,22 +1138,26 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToUJ
- movdqa xmm6, kARGBToVJ
movdqa xmm5, kAddUVJ128
+ movdqa xmm6, kARGBToVJ
+ movdqa xmm7, kARGBToUJ
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1330,10 +1186,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
packsswb xmm0, xmm1
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1359,7 +1215,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
vbroadcastf128 ymm7, kARGBToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqu ymm0, [eax]
@@ -1395,10 +1250,10 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values
- sub ecx, 32
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
+ sub ecx, 32
jg convertloop
pop edi
@@ -1410,147 +1265,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(16))
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToUJ
- movdqa xmm6, kARGBToVJ
- movdqa xmm5, kAddUVJ128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- paddw xmm0, xmm5 // +.5 rounding -> unsigned
- paddw xmm1, xmm5
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1559,70 +1273,11 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* convert to U and V */
- movdqa xmm0, [eax] // U
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
-
- movdqa xmm0, [eax] // V
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm6
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm2, xmm6
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- lea eax, [eax + 64]
- movdqa [edx + edi], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
+ movdqa xmm7, kARGBToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* convert to U and V */
movdqu xmm0, [eax] // U
@@ -1639,7 +1294,6 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
psraw xmm2, 8
packsswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
movdqu xmm0, [eax] // V
@@ -1659,6 +1313,7 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
lea eax, [eax + 64]
movdqu [edx + edi], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1675,18 +1330,17 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
movdqa xmm5, kAddUV128
+ movdqa xmm6, kARGBToV
+ movdqa xmm7, kARGBToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1714,10 +1368,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1726,26 +1380,36 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
}
__declspec(naked) __declspec(align(16))
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
__asm {
+ push esi
push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
movdqa xmm5, kAddUV128
+ movdqa xmm6, kBGRAToV
+ movdqa xmm7, kBGRAToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1773,19 +1437,20 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
+ pop esi
ret
}
}
__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
@@ -1795,22 +1460,26 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kBGRAToU
- movdqa xmm6, kBGRAToV
movdqa xmm5, kAddUV128
+ movdqa xmm6, kABGRToV
+ movdqa xmm7, kABGRToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1838,10 +1507,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1851,8 +1520,8 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
@@ -1861,26 +1530,26 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kBGRAToU
- movdqa xmm6, kBGRAToV
movdqa xmm5, kAddUV128
+ movdqa xmm6, kRGBAToV
+ movdqa xmm7, kRGBAToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1908,10 +1577,10 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1919,314 +1588,263 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
ret
}
}
+#endif // HAS_ARGBTOYROW_SSSE3
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 __asm { \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ }
+
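READYUV422_AVX2 is a nearest-neighbor chroma upsample: the 8 (U,V) pairs are interleaved and each pair duplicated to cover two output pixels. A scalar equivalent (sketch):

    // 8 U and 8 V bytes -> 16 interleaved UV pairs (32 bytes).
    static void Upsample422UV(const unsigned char* u_buf, const unsigned char* v_buf,
                              unsigned char uv[32]) {
      for (int i = 0; i < 16; ++i) {   // 16 output pixels
        uv[2 * i + 0] = u_buf[i / 2];  // each U covers two pixels
        uv[2 * i + 1] = v_buf[i / 2];  // each V covers two pixels
      }
    }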
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 __asm { \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ }
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) __asm { \
+ /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \
+ __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \
+ __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \
+ __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasR \
+ __asm vpsubw ymm2, ymm3, ymm2 \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasG \
+ __asm vpsubw ymm1, ymm3, ymm1 \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasB \
+ __asm vpsubw ymm0, ymm3, ymm0 \
+ /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm vmovdqu xmm3, [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 16] \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklbw ymm3, ymm3, ymm3 \
+ __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \
+ __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \
+ __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \
+ __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \
+ __asm vpsraw ymm0, ymm0, 6 \
+ __asm vpsraw ymm1, ymm1, 6 \
+ __asm vpsraw ymm2, ymm2, 6 \
+ __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
+ __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
+ __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
+ }
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 __asm { \
+ /* Step 3: Weave into ARGB */ \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpermq ymm2, ymm2, 0xd8 \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vmovdqu [edx], ymm1 \
+ __asm vmovdqu [edx + 32], ymm0 \
+ __asm lea edx, [edx + 64] \
+ }
+
+#ifdef HAS_I422TOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kABGRToU
- movdqa xmm6, kABGRToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- align 4
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ STOREARGB_AVX2
- // step 3 - store 8 U and 8 V values
sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
jg convertloop
pop edi
pop esi
+ vzeroupper
ret
}
}
+#endif // HAS_I422TOARGBROW_AVX2
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void NV12ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kABGRToU
- movdqa xmm6, kABGRToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- align 4
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
+ READNV12_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ STOREARGB_AVX2
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels.
+// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READNV12_AVX2
+ YUVTORGB_AVX2(kYvuConstants)
+ STOREARGB_AVX2
- // step 3 - store 8 U and 8 V values
sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
jg convertloop
- pop edi
pop esi
ret
}
}
+#endif // HAS_NV21TOARGBROW_AVX2
+#ifdef HAS_I422TOBGRAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void I422ToBGRARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kRGBAToU
- movdqa xmm6, kRGBAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- align 4
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
- // step 3 - store 8 U and 8 V values
+ // Step 3: Weave into BGRA
+ vpunpcklbw ymm1, ymm1, ymm0 // GB
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm5, ymm2 // AR
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels
+ vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
jg convertloop
pop edi
pop esi
+ vzeroupper
ret
}
}
+#endif // HAS_I422TOBGRAROW_AVX2
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kRGBAToU
- movdqa xmm6, kRGBAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- align 4
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
+ // Step 3: Weave into RGBA
+ vpunpcklbw ymm1, ymm1, ymm2 // GR
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm5, ymm0 // AB
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels
+ vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
jg convertloop
pop edi
pop esi
+ vzeroupper
ret
}
}
-#endif // HAS_ARGBTOYROW_SSSE3
-
-#ifdef HAS_I422TOARGBROW_AVX2
-
-static const lvec8 kUVToB_AVX = {
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-static const lvec8 kUVToR_AVX = {
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-static const lvec8 kUVToG_AVX = {
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-static const lvec16 kYToRgb_AVX = {
- YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
-};
-static const lvec16 kYSub16_AVX = {
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-};
-static const lvec16 kUVBiasB_AVX = {
- BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
-};
-static const lvec16 kUVBiasG_AVX = {
- BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
-};
-static const lvec16 kUVBiasR_AVX = {
- BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
-};
+#endif // HAS_I422TORGBAROW_AVX2
+#ifdef HAS_I422TOABGRROW_AVX2
// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_AVX2(const uint8* y_buf,
+void I422ToABGRRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
@@ -2241,63 +1859,33 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width
sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- vpxor ymm4, ymm4, ymm4
- align 4
convertloop:
- vmovq xmm0, qword ptr [esi] // U
- vmovq xmm1, qword ptr [esi + edi] // V
- lea esi, [esi + 8]
- vpunpcklbw ymm0, ymm0, ymm1 // UV
- vpermq ymm0, ymm0, 0xd8
- vpunpcklwd ymm0, ymm0, ymm0 // UVUV
- vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV
- vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV
- vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV
- vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed
- vpsubw ymm1, ymm1, kUVBiasG_AVX
- vpsubw ymm0, ymm0, kUVBiasR_AVX
-
- // Step 2: Find Y contribution to 16 R,G,B values
- vmovdqu xmm3, [eax] // NOLINT
- lea eax, [eax + 16]
- vpermq ymm3, ymm3, 0xd8
- vpunpcklbw ymm3, ymm3, ymm4
- vpsubsw ymm3, ymm3, kYSub16_AVX
- vpmullw ymm3, ymm3, kYToRgb_AVX
- vpaddsw ymm2, ymm2, ymm3 // B += Y
- vpaddsw ymm1, ymm1, ymm3 // G += Y
- vpaddsw ymm0, ymm0, ymm3 // R += Y
- vpsraw ymm2, ymm2, 6
- vpsraw ymm1, ymm1, 6
- vpsraw ymm0, ymm0, 6
- vpackuswb ymm2, ymm2, ymm2 // B
- vpackuswb ymm1, ymm1, ymm1 // G
- vpackuswb ymm0, ymm0, ymm0 // R
-
- // Step 3: Weave into ARGB
- vpunpcklbw ymm2, ymm2, ymm1 // BG
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ABGR
+ vpunpcklbw ymm1, ymm2, ymm1 // RG
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm0, ymm5 // BA
vpermq ymm2, ymm2, 0xd8
- vpunpcklbw ymm0, ymm0, ymm5 // RA
- vpermq ymm0, ymm0, 0xd8
- vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels
- vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
+ vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 16
jg convertloop
- vzeroupper
pop edi
pop esi
+ vzeroupper
ret
}
}
-#endif // HAS_I422TOARGBROW_AVX2
-
-#ifdef HAS_I422TOARGBROW_SSSE3
+#endif // HAS_I422TOABGRROW_AVX2
+#if defined(HAS_I422TOARGBROW_SSSE3)
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Read 8 UV from 444.
@@ -2337,22 +1925,25 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
}
// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB __asm { \
+#define YUVTORGB(YuvConstants) __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
- __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
- __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
- __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
- __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
- __asm psubw xmm1, kUVBiasG \
- __asm psubw xmm2, kUVBiasR \
+ __asm movdqa xmm3, xmm0 \
+ __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \
+ __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \
+ __asm psubw xmm0, xmm1 \
+ __asm movdqa xmm1, YuvConstants.kUVBiasG \
+ __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \
+ __asm psubw xmm1, xmm2 \
+ __asm movdqa xmm2, YuvConstants.kUVBiasR \
+ __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \
+ __asm psubw xmm2, xmm3 \
/* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
__asm lea eax, [eax + 8] \
- __asm punpcklbw xmm3, xmm4 \
- __asm psubsw xmm3, kYSub16 \
- __asm pmullw xmm3, kYToRgb \
+ __asm punpcklbw xmm3, xmm3 \
+ __asm pmulhuw xmm3, YuvConstants.kYToRgb \
__asm paddsw xmm0, xmm3 /* B += Y */ \
__asm paddsw xmm1, xmm3 /* G += Y */ \
__asm paddsw xmm2, xmm3 /* R += Y */ \
@@ -2364,35 +1955,131 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
__asm packuswb xmm2, xmm2 /* R */ \
}
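// A scalar sketch of the per-pixel math READYUV422 + YUVTORGB implement,
// assuming the usual BT.601 studio-swing coefficients scaled by 64 (hence
// the final >> 6). The Y constants 18997 and 1160 match the YToARGBRow
// comments below; the UV coefficients are illustrative -- the shipping
// kYuvConstants tables fold them and the biases into pmaddubsw/psubw
// operands (and clamp 129 to 128 to fit pmaddubsw's signed range).
#include <stdint.h>

static uint8_t clamp255(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

static void yuv_to_bgr(uint8_t y, uint8_t u, uint8_t v,
                       uint8_t* b, uint8_t* g, uint8_t* r) {
  // punpcklbw xmm3, xmm3 widens y to y*257; pmulhuw keeps the high 16 bits.
  int y1 = (int)(((uint32_t)(y * 0x0101) * 18997) >> 16) - 1160;  // ~74.5*(y-16)
  *b = clamp255((y1 + 129 * (u - 128)) >> 6);                  // 2.018 * 64
  *g = clamp255((y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6);  // 0.391, 0.813
  *r = clamp255((y1 + 102 * (v - 128)) >> 6);                  // 1.596 * 64
}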
-// Convert 8 pixels: 8 VU and 8 Y.
-#define YVUTORGB __asm { \
- /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+// Store 8 ARGB values.
+#define STOREARGB __asm { \
+ /* Step 3: Weave into ARGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
- __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
- __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
- __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
- __asm psubw xmm1, kUVBiasG \
- __asm psubw xmm2, kUVBiasR \
- /* Step 2: Find Y contribution to 8 R,G,B values */ \
- __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
- __asm lea eax, [eax + 8] \
- __asm punpcklbw xmm3, xmm4 \
- __asm psubsw xmm3, kYSub16 \
- __asm pmullw xmm3, kYToRgb \
- __asm paddsw xmm0, xmm3 /* B += Y */ \
- __asm paddsw xmm1, xmm3 /* G += Y */ \
- __asm paddsw xmm2, xmm3 /* R += Y */ \
- __asm psraw xmm0, 6 \
- __asm psraw xmm1, 6 \
- __asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
- __asm packuswb xmm2, xmm2 /* R */ \
- }
-
-// 8 pixels, dest aligned 16.
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm movdqu [edx], xmm0 \
+ __asm movdqu [edx + 16], xmm1 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 BGRA values.
+#define STOREBGRA __asm { \
+ /* Step 3: Weave into BGRA */ \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm movdqu [edx], xmm5 \
+ __asm movdqu [edx + 16], xmm0 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 ABGR values.
+#define STOREABGR __asm { \
+ /* Step 3: Weave into ABGR */ \
+ __asm punpcklbw xmm2, xmm1 /* RG */ \
+ __asm punpcklbw xmm0, xmm5 /* BA */ \
+ __asm movdqa xmm1, xmm2 \
+ __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
+ __asm movdqu [edx], xmm2 \
+ __asm movdqu [edx + 16], xmm1 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 RGBA values.
+#define STORERGBA __asm { \
+ /* Step 3: Weave into RGBA */ \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm movdqu [edx], xmm5 \
+ __asm movdqu [edx + 16], xmm0 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 RGB24 values.
+#define STORERGB24 __asm { \
+ /* Step 3: Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* Step 4: RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+    __asm palignr  xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 from xmm1 */ \
+ __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24] \
+ }
+
+// Store 8 RAW values.
+#define STORERAW __asm { \
+ /* Step 3: Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* Step 4: RRGB -> RAW */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+    __asm palignr  xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 from xmm1 */ \
+ __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24] \
+ }
+
+// Store 8 RGB565 values.
+#define STORERGB565 __asm { \
+ /* Step 3: Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* Step 4: RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
+ __asm packssdw xmm0, xmm1 \
+ __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm lea edx, [edx + 16] \
+ }
+
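// Scalar equivalent of the STORERGB565 packing above, for reference: the
// three masks generated in the callers (0x0000001f, 0x000007e0, 0xfffff800)
// keep the top 5, 6 and 5 bits of B, G and R. A sketch, not the vector path.
#include <stdint.h>

static uint16_t argb_to_rgb565(uint32_t argb) {
  uint32_t b = (argb >> 0) & 0xff;
  uint32_t g = (argb >> 8) & 0xff;
  uint32_t r = (argb >> 16) & 0xff;
  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}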
+// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2410,22 +2097,12 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 4
convertloop:
READYUV444
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2435,8 +2112,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -2452,27 +2129,14 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // rgb24
mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pxor xmm4, xmm4
movdqa xmm5, kShuffleMaskARGBToRGB24_0
movdqa xmm6, kShuffleMaskARGBToRGB24
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STORERGB24
- // Step 3: Weave into RRGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm2 // RR
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRR first 4 pixels
- punpckhwd xmm1, xmm2 // BGRR next 4 pixels
- pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
- pshufb xmm1, xmm6 // Pack into first 12 bytes.
- palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
- movq qword ptr [edx], xmm0 // First 8 bytes
- movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
- lea edx, [edx + 24]
sub ecx, 8
jg convertloop
@@ -2482,8 +2146,8 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -2499,27 +2163,14 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // raw
mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pxor xmm4, xmm4
movdqa xmm5, kShuffleMaskARGBToRAW_0
movdqa xmm6, kShuffleMaskARGBToRAW
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STORERAW
- // Step 3: Weave into RRGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm2 // RR
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRR first 4 pixels
- punpckhwd xmm1, xmm2 // BGRR next 4 pixels
- pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
- pshufb xmm1, xmm6 // Pack into first 12 bytes.
- palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
- movq qword ptr [edx], xmm0 // First 8 bytes
- movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
- lea edx, [edx + 24]
sub ecx, 8
jg convertloop
@@ -2529,8 +2180,8 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// 8 pixels
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -2546,7 +2197,6 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // rgb565
mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pxor xmm4, xmm4
pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
psrld xmm5, 27
pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
@@ -2555,45 +2205,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
pslld xmm7, 11
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STORERGB565
- // Step 3: Weave into RRGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm2 // RR
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRR first 4 pixels
- punpckhwd xmm1, xmm2 // BGRR next 4 pixels
-
- // Step 3b: RRGB -> RGB565
- movdqa xmm3, xmm0 // B first 4 pixels of argb
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm3, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm3, xmm5 // B
- pand xmm2, xmm6 // G
- pand xmm0, xmm7 // R
- por xmm3, xmm2 // BG
- por xmm0, xmm3 // BGR
- movdqa xmm3, xmm1 // B next 4 pixels of argb
- movdqa xmm2, xmm1 // G
- pslld xmm1, 8 // R
- psrld xmm3, 3 // B
- psrld xmm2, 5 // G
- psrad xmm1, 16 // R
- pand xmm3, xmm5 // B
- pand xmm2, xmm6 // G
- pand xmm1, xmm7 // R
- por xmm3, xmm2 // BG
- por xmm1, xmm3 // BGR
- packssdw xmm0, xmm1
sub ecx, 8
- movdqu [edx], xmm0 // store 8 pixels of RGB565
- lea edx, [edx + 16]
jg convertloop
pop edi
@@ -2602,7 +2219,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2620,22 +2237,12 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2645,7 +2252,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
+// 8 pixels.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
@@ -2664,23 +2271,13 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 12 + 16] // argb
mov ecx, [esp + 12 + 20] // width
sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- align 4
convertloop:
READYUV411 // modifies EBX
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2691,7 +2288,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2705,22 +2302,12 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 4 + 12] // argb
mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 4
convertloop:
READNV12
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2729,8 +2316,8 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// 8 pixels.
+// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
@@ -2739,234 +2326,16 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
__asm {
push esi
mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // VU
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READNV12
- YVUTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV444
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ecx, [esp + 12 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV411 // modifies EBX
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
mov esi, [esp + 4 + 8] // UV
mov edx, [esp + 4 + 12] // argb
mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 4
convertloop:
READNV12
- YUVTORGB
+ YUVTORGB(kYvuConstants)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // VU
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READNV12
- YVUTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2990,64 +2359,12 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // bgra
mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into BGRA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm0 // GB
- punpcklbw xmm5, xmm2 // AR
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // BGRA first 4 pixels
- punpckhwd xmm0, xmm1 // BGRA next 4 pixels
- movdqa [edx], xmm5
- movdqa [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_bgra,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // bgra
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREBGRA
- // Step 3: Weave into BGRA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm0 // GB
- punpcklbw xmm5, xmm2 // AR
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // BGRA first 4 pixels
- punpckhwd xmm0, xmm1 // BGRA next 4 pixels
- movdqu [edx], xmm5
- movdqu [edx + 16], xmm0
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3073,63 +2390,12 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREABGR
- // Step 3: Weave into ARGB
- punpcklbw xmm2, xmm1 // RG
- punpcklbw xmm0, xmm5 // BA
- movdqa xmm1, xmm2
- punpcklwd xmm2, xmm0 // RGBA first 4 pixels
- punpckhwd xmm1, xmm0 // RGBA next 4 pixels
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_abgr,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // abgr
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm2, xmm1 // RG
- punpcklbw xmm0, xmm5 // BA
- movdqa xmm1, xmm2
- punpcklwd xmm2, xmm0 // RGBA first 4 pixels
- punpckhwd xmm1, xmm0 // RGBA next 4 pixels
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3154,64 +2420,12 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // rgba
mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pxor xmm4, xmm4
- align 4
convertloop:
READYUV422
- YUVTORGB
-
- // Step 3: Weave into RGBA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm2 // GR
- punpcklbw xmm5, xmm0 // AB
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // RGBA first 4 pixels
- punpckhwd xmm0, xmm1 // RGBA next 4 pixels
- movdqa [edx], xmm5
- movdqa [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
+ YUVTORGB(kYuvConstants)
+ STORERGBA
-__declspec(naked) __declspec(align(16))
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // rgba
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into RGBA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm2 // GR
- punpcklbw xmm5, xmm0 // AB
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // RGBA first 4 pixels
- punpckhwd xmm0, xmm1 // RGBA next 4 pixels
- movdqu [edx], xmm5
- movdqu [edx + 16], xmm0
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3224,32 +2438,32 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_YTOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void YToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm {
- pxor xmm5, xmm5
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
- mov eax, 0x00100010
- movd xmm3, eax
- pshufd xmm3, xmm3, 0
- mov eax, 0x004a004a // 74
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
movd xmm2, eax
    pshufd     xmm2, xmm2, 0
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
mov eax, [esp + 4] // Y
mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width
- align 4
convertloop:
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
- punpcklbw xmm0, xmm5 // 0.Y
+ punpcklbw xmm0, xmm0 // Y.Y
+ pmulhuw xmm0, xmm2
psubusw xmm0, xmm3
- pmullw xmm0, xmm2
psrlw xmm0, 6
packuswb xmm0, xmm0 // G
@@ -3260,23 +2474,74 @@ void YToARGBRow_SSE2(const uint8* y_buf,
punpckhwd xmm1, xmm1 // BGRA next 4 pixels
por xmm0, xmm4
por xmm1, xmm4
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
-
ret
}
}
#endif // HAS_YTOARGBROW_SSE2
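// A quick scalar check of the constants above, assuming the same
// pmulhuw-based evaluation: G = (((y * 0x0101) * 0x4a35 >> 16) - 0x0488) >> 6
// with unsigned saturation, so y = 16 maps to 0 and y = 235 maps to 255.
#include <assert.h>
#include <stdint.h>

static uint8_t scale_y(uint8_t y) {
  uint32_t g = ((uint32_t)(y * 0x0101) * 0x4a35) >> 16;  // high half, as pmulhuw
  g = g > 0x0488 ? g - 0x0488 : 0;                       // psubusw saturates at 0
  g >>= 6;
  return g > 255 ? 255 : (uint8_t)g;                     // packuswb saturates at 255
}

static void check_scale_y(void) {
  assert(scale_y(16) == 0);     // black level
  assert(scale_y(235) == 255);  // white level
}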
+#ifdef HAS_YTOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+__declspec(naked) __declspec(align(16))
+void YToARGBRow_AVX2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ vmovd xmm2, eax
+ vbroadcastss ymm2, xmm2
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ vmovd xmm3, eax
+ vbroadcastss ymm3, xmm3
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
+ vpslld ymm4, ymm4, 24
+
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ vmovdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
+ vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
+ vpmulhuw ymm0, ymm0, ymm2
+ vpsubusw ymm0, ymm0, ymm3
+ vpsrlw ymm0, ymm0, 6
+ vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
+
+ // TODO(fbarchard): Weave alpha with unpack.
+ // Step 2: Weave into ARGB
+ vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm4
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YTOARGBROW_AVX2
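// Why the vpermq 0xd8 shuffles keep appearing: 256-bit unpack instructions
// interleave within each 128-bit lane, so a plain vpunpcklbw pairs bytes
// 0..7 with 16..23 ("mutates"). Pre-permuting qwords into 0,2,1,3 order
// restores a linear result. A sketch with AVX2 intrinsics:
#include <immintrin.h>

static __m256i interleave_bytes_linear(__m256i a, __m256i b) {
  a = _mm256_permute4x64_epi64(a, 0xd8);  // qword order 0,2,1,3
  b = _mm256_permute4x64_epi64(b, 0xd8);
  return _mm256_unpacklo_epi8(a, b);      // bytes 0..15 of a and b, interleaved
}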
+
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
+// TODO(fbarchard): Replace lea with -16 offset.
__declspec(naked) __declspec(align(16))
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
__asm {
@@ -3284,15 +2549,13 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
movdqa xmm5, kShuffleMirror
- lea eax, [eax - 16]
- align 4
convertloop:
- movdqa xmm0, [eax + ecx]
+ movdqu xmm0, [eax - 16 + ecx]
pshufb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
@@ -3300,29 +2563,21 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#endif // HAS_MIRRORROW_SSSE3
#ifdef HAS_MIRRORROW_AVX2
-// Shuffle table for reversing the bytes.
-static const ulvec8 kShuffleMirror_AVX2 = {
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
__declspec(naked) __declspec(align(16))
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- vmovdqa ymm5, kShuffleMirror_AVX2
- lea eax, [eax - 32]
+ vbroadcastf128 ymm5, kShuffleMirror
- align 4
convertloop:
- vmovdqu ymm0, [eax + ecx]
+ vmovdqu ymm0, [eax - 32 + ecx]
vpshufb ymm0, ymm0, ymm5
    vpermq     ymm0, ymm0, 0x4e  // swap high and low halves
- sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
vzeroupper
ret
@@ -3331,19 +2586,15 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORROW_SSE2
-// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
-// version can not.
__declspec(naked) __declspec(align(16))
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- lea eax, [eax - 16]
- align 4
convertloop:
- movdqu xmm0, [eax + ecx]
+ movdqu xmm0, [eax - 16 + ecx]
movdqa xmm1, xmm0 // swap bytes
psllw xmm0, 8
psrlw xmm1, 8
@@ -3351,9 +2602,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
pshuflw xmm0, xmm0, 0x1b // swap words
pshufhw xmm0, xmm0, 0x1b
pshufd xmm0, xmm0, 0x4e // swap qwords
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
@@ -3379,15 +2630,14 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
- align 4
convertloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
- sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
+ sub ecx, 8
jg convertloop
pop edi
@@ -3396,34 +2646,27 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
}
#endif // HAS_MIRRORROW_UV_SSSE3
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static const uvec8 kARGBShuffleMirror = {
- 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
-
+#ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked) __declspec(align(16))
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
- movdqa xmm5, kARGBShuffleMirror
- align 4
convertloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax - 16]
- pshufb xmm0, xmm5
- sub ecx, 4
- movdqa [edx], xmm0
+ pshufd xmm0, xmm0, 0x1b
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
ret
}
}
-#endif // HAS_ARGBMIRRORROW_SSSE3
+#endif // HAS_ARGBMIRRORROW_SSE2
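// Scalar equivalent of ARGBMirrorRow_SSE2 above: whole 32-bit pixels are
// reversed, not bytes, which is why pshufd 0x1b (dword order 3,2,1,0)
// replaces the byte shuffle table the SSSE3 version used. A sketch:
#include <stdint.h>

static void argb_mirror_row(const uint32_t* src, uint32_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];  // copy pixels back to front
  }
}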
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
@@ -3437,15 +2680,13 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- lea eax, [eax - 32]
- vmovdqa ymm5, kARGBShuffleMirror_AVX2
+ vmovdqu ymm5, kARGBShuffleMirror_AVX2
- align 4
convertloop:
- vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
- sub ecx, 8
+ vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 8
jg convertloop
vzeroupper
ret
@@ -3466,44 +2707,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
psrlw xmm5, 8
sub edi, edx
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm5 // even bytes
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm2, 8 // odd bytes
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqa [edx], xmm0
- movdqa [edx + edi], xmm2
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -3526,6 +2729,7 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
ret
}
}
+
#endif // HAS_SPLITUVROW_SSE2
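// Scalar equivalent of SplitUVRow above: pand with 0x00ff00ff selects the
// even (U) bytes and psrlw 8 the odd (V) bytes before packing. A sketch;
// MergeUVRow below is the inverse interleave.
#include <stdint.h>

static void split_uv_row(const uint8_t* src_uv, uint8_t* dst_u,
                         uint8_t* dst_v, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[2 * i + 0];  // even bytes
    dst_v[i] = src_uv[2 * i + 1];  // odd bytes
  }
}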
#ifdef HAS_SPLITUVROW_AVX2
@@ -3541,7 +2745,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3579,37 +2782,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
mov ecx, [esp + 4 + 16] // width
sub edx, eax
- align 4
- convertloop:
- movdqa xmm0, [eax] // read 16 U's
- movdqa xmm1, [eax + edx] // and 16 V's
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- punpcklbw xmm0, xmm1 // first 8 UV pairs
- punpckhbw xmm2, xmm1 // next 8 UV pairs
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
-
- align 4
convertloop:
movdqu xmm0, [eax] // read 16 U's
movdqu xmm1, [eax + edx] // and 16 V's
@@ -3641,17 +2813,16 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
mov ecx, [esp + 4 + 16] // width
sub edx, eax
- align 4
convertloop:
vmovdqu ymm0, [eax] // read 32 U's
vmovdqu ymm1, [eax + edx] // and 32 V's
lea eax, [eax + 32]
vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
- vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
- vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
- vmovdqu [edi], ymm1
- vmovdqu [edi + 32], ymm2
+ vextractf128 [edi], ymm2, 0 // bytes 0..15
+ vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
+ vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
+    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
lea edi, [edi + 64]
sub ecx, 32
jg convertloop
@@ -3672,13 +2843,12 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
- align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 32
jg convertloop
@@ -3687,39 +2857,46 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_SSE2
-// Unaligned Multiple of 1.
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
__declspec(naked) __declspec(align(16))
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
__asm {
- mov eax, esi
- mov edx, edi
- mov esi, [esp + 4] // src
- mov edi, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
- rep movsb
- mov edi, edx
- mov esi, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 64
+ jg convertloop
+
+ vzeroupper
ret
}
}
+#endif // HAS_COPYROW_AVX
-#ifdef HAS_COPYROW_X86
+// Multiple of 1.
__declspec(naked) __declspec(align(16))
-void CopyRow_X86(const uint8* src, uint8* dst, int count) {
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
__asm {
mov eax, esi
mov edx, edi
mov esi, [esp + 4] // src
mov edi, [esp + 8] // dst
mov ecx, [esp + 12] // count
- shr ecx, 2
- rep movsd
+ rep movsb
mov edi, edx
mov esi, eax
ret
}
}
-#endif // HAS_COPYROW_X86
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
@@ -3734,21 +2911,20 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
psrld xmm1, 8
- align 4
convertloop:
- movdqa xmm2, [eax]
- movdqa xmm3, [eax + 16]
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
- movdqa xmm4, [edx]
- movdqa xmm5, [edx + 16]
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
pand xmm2, xmm0
pand xmm3, xmm0
pand xmm4, xmm1
pand xmm5, xmm1
por xmm2, xmm4
por xmm3, xmm5
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3769,7 +2945,6 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
vpcmpeqb ymm0, ymm0, ymm0
vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
- align 4
convertloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + 32]
@@ -3801,23 +2976,22 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
psrld xmm1, 8
- align 4
convertloop:
movq xmm2, qword ptr [eax] // 8 Y's
lea eax, [eax + 8]
punpcklbw xmm2, xmm2
punpckhwd xmm3, xmm2
punpcklwd xmm2, xmm2
- movdqa xmm4, [edx]
- movdqa xmm5, [edx + 16]
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
pand xmm2, xmm0
pand xmm3, xmm0
pand xmm4, xmm1
pand xmm5, xmm1
por xmm2, xmm4
por xmm3, xmm5
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3838,7 +3012,6 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
vpcmpeqb ymm0, ymm0, ymm0
vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
- align 4
convertloop:
vpmovzxbd ymm1, qword ptr [eax]
vpmovzxbd ymm2, qword ptr [eax + 8]
@@ -3860,13 +3033,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
+// Write 'count' bytes using an 8 bit value repeated.
+// Count should be a multiple of 4.
__declspec(naked) __declspec(align(16))
-void SetRow_X86(uint8* dst, uint32 v32, int count) {
+void SetRow_X86(uint8* dst, uint8 v8, int count) {
__asm {
+ movzx eax, byte ptr [esp + 8] // v8
+ mov edx, 0x01010101 // Duplicate byte to all bytes.
+ mul edx // overwrites edx with upper part of result.
mov edx, edi
mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
shr ecx, 2
rep stosd
@@ -3875,33 +3051,30 @@ void SetRow_X86(uint8* dst, uint32 v32, int count) {
}
}
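// The movzx/mul pair above splats the byte across a 32-bit word
// (v8 * 0x01010101) so rep stosd can store four bytes per iteration.
// The same idea in C, assuming count is a multiple of 4 as documented:
#include <stdint.h>

static void set_row_c(uint8_t* dst, uint8_t v8, int count) {
  uint32_t v32 = (uint32_t)v8 * 0x01010101u;  // e.g. 0x5a -> 0x5a5a5a5a
  uint32_t* d = (uint32_t*)dst;
  for (int i = 0; i < count / 4; ++i) {
    d[i] = v32;
  }
}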
-// SetRow32 writes 'count' words using a 32 bit value repeated.
+// Write 'count' bytes using an 8 bit value repeated.
__declspec(naked) __declspec(align(16))
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
+void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
__asm {
- push esi
- push edi
- push ebp
- mov edi, [esp + 12 + 4] // dst
- mov eax, [esp + 12 + 8] // v32
- mov ebp, [esp + 12 + 12] // width
- mov edx, [esp + 12 + 16] // dst_stride
- mov esi, [esp + 12 + 20] // height
- lea ecx, [ebp * 4]
- sub edx, ecx // stride - width * 4
-
- align 4
- convertloop:
- mov ecx, ebp
- rep stosd
- add edi, edx
- sub esi, 1
- jg convertloop
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v8
+ mov ecx, [esp + 12] // count
+ rep stosb
+ mov edi, edx
+ ret
+ }
+}
- pop ebp
- pop edi
- pop esi
+// Write 'count' 32 bit values.
+__declspec(naked) __declspec(align(16))
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // count
+ rep stosd
+ mov edi, edx
ret
}
}
@@ -3918,7 +3091,6 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3927,9 +3099,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
- sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
vzeroupper
ret
@@ -3951,7 +3123,6 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3994,7 +3165,6 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -4029,7 +3199,6 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -4038,12 +3207,12 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
- sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
- ret
vzeroupper
+ ret
}
}
@@ -4062,7 +3231,6 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -4105,7 +3273,6 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -4144,114 +3311,6 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // even bytes are Y
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4259,17 +3318,17 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
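// Byte layout these rows rely on: YUY2 is Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...,
// so the 0x00ff00ff mask keeps the even (Y) bytes and psrlw 8 keeps the
// odd (chroma) bytes; UYVY is the opposite. Scalar sketch of YUY2ToYRow:
#include <stdint.h>

static void yuy2_to_y_row(const uint8_t* src_yuy2, uint8_t* dst_y, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_y[i] = src_yuy2[2 * i];  // for UYVY, Y sits on the odd bytes instead
  }
}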
__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
@@ -4282,7 +3341,6 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
psrlw xmm5, 8
sub edi, edx
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4312,8 +3370,8 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
}
__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
@@ -4324,7 +3382,6 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
psrlw xmm5, 8
sub edi, edx
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4356,112 +3413,6 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
-
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4469,17 +3420,17 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
@@ -4492,7 +3443,6 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
psrlw xmm5, 8
sub edi, edx
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4522,8 +3472,8 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
}
__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
@@ -4534,7 +3484,6 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
psrlw xmm5, 8
sub edi, edx
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4607,9 +3556,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge alignloop1
alignloop1b:
@@ -4638,9 +3587,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jge convertloop4
convertloop4b:
@@ -4669,9 +3618,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge convertloop1
convertloop1b:
@@ -4739,48 +3688,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge alignloop1
alignloop1b:
add ecx, 1 - 4
jl convertloop4b
- test eax, 15 // unaligned?
- jne convertuloop4
- test esi, 15 // unaligned?
- jne convertuloop4
-
// 4 pixel loop.
convertloop4:
- movdqa xmm3, [eax] // src argb
- lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqa xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqa xmm1, [esi] // _a_g
- lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jge convertloop4
- jmp convertloop4b
-
- // 4 pixel unaligned loop.
- convertuloop4:
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
movdqa xmm0, xmm3 // src argb
@@ -4799,10 +3717,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
- jge convertuloop4
+ sub ecx, 4
+ jge convertloop4
convertloop4b:
add ecx, 4 - 1
@@ -4828,9 +3746,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge convertloop1
convertloop1b:
@@ -4842,7 +3760,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
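// The blend loops above compute "over" in 8.8 fixed point with the source
// alpha inverted and bumped to 256: out = src + dst * (256 - a) / 256, and
// the result alpha is forced opaque. Per-channel scalar sketch:
#include <stdint.h>

static uint8_t blend_channel(uint8_t src, uint8_t dst, uint8_t src_a) {
  uint32_t v = src + ((dst * (256 - src_a)) >> 8);  // pmullw + psrlw 8
  return v > 255 ? 255 : (uint8_t)v;                // paddusb saturates
}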
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
-// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
@@ -4854,19 +3771,18 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
psrld xmm5, 8
- align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm0 // first 2
pshufhw xmm2, xmm0, 0FFh // 8 alpha words
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm0, xmm2 // rgb * a
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm1 // next 2 pixels
pshufhw xmm2, xmm1, 0FFh // 8 alpha words
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm1, xmm2 // rgb * a
- movdqa xmm2, [eax] // alphas
+ movdqu xmm2, [eax] // alphas
lea eax, [eax + 16]
psrlw xmm0, 8
pand xmm2, xmm4
@@ -4874,9 +3790,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
packuswb xmm0, xmm1
pand xmm0, xmm5 // keep original alphas
por xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
ret
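
For reference, a scalar model of the attenuation (alpha premultiply) this loop approximates, assuming BGRA byte order: punpcklbw against itself widens each byte v to the word v*257, and pmulhuw plus the final psrlw keep the top bits, which works out to roughly rgb * alpha / 255 (illustrative helper only):

  static void AttenuatePixel(const unsigned char src[4], unsigned char dst[4]) {
    unsigned int a = src[3];
    for (int c = 0; c < 3; ++c)   // (v*257 * a*257) >> 24  ~=  v * a / 255
      dst[c] = (unsigned char)((src[c] * 257u * (a * 257u)) >> 24);
    dst[3] = (unsigned char)a;    // pand/por keep the original alphas
  }
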
@@ -4904,7 +3820,6 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
movdqa xmm4, kShuffleAlpha0
movdqa xmm5, kShuffleAlpha1
- align 4
convertloop:
movdqu xmm0, [eax] // read 4 pixels
pshufb xmm0, xmm4 // isolate first 2 alphas
@@ -4923,9 +3838,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
psrlw xmm1, 8
packuswb xmm0, xmm1
por xmm0, xmm2 // copy original alpha
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
ret
@@ -4935,11 +3850,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
-static const ulvec8 kShuffleAlpha_AVX2 = {
- 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
- 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
- 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
- 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+static const uvec8 kShuffleAlpha_AVX2 = {
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
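
The table change above pairs with the vbroadcastf128 load in the next hunk: the 32-byte ulvec8 (pattern written out twice, loaded with vmovdqa and therefore 32-byte aligned) becomes a 16-byte uvec8 that is duplicated into both lanes of ymm4 at load time. A rough intrinsics equivalent, assuming AVX2 headers (the asm uses the float-domain vbroadcastf128; the integer-domain intrinsic is shown):

  #include <immintrin.h>

  static __m256i LoadShuffleAlpha(const unsigned char tbl16[16]) {
    __m128i pattern = _mm_loadu_si128((const __m128i*)tbl16);
    return _mm256_broadcastsi128_si256(pattern);  // same 16 bytes in both lanes
  }
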
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -4948,11 +3860,10 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
- vmovdqa ymm4, kShuffleAlpha_AVX2
+ vbroadcastf128 ymm4, kShuffleAlpha_AVX2
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24
- align 4
convertloop:
vmovdqu ymm6, [eax] // read 8 pixels.
vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
@@ -4966,9 +3877,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // unmutated.
vpor ymm0, ymm0, ymm6 // copy original alpha
- sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
+ sub ecx, 8
jg convertloop
vzeroupper
@@ -4979,7 +3890,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
@@ -4990,7 +3900,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
mov edx, [esp + 8 + 8] // dst_argb
mov ecx, [esp + 8 + 12] // width
- align 4
convertloop:
movdqu xmm0, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 3] // first alpha
@@ -5016,9 +3925,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 16]
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
pop edi
pop esi
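
Unattenuation is the inverse operation: the loop above looks up a fixed-point reciprocal of the alpha byte (libyuv's fixed_invtbl8) and multiplies it back in. A scalar sketch of the intended result, ignoring the table's rounding details (helper name illustrative):

  // rgb' = min(255, rgb * 255 / alpha); alpha itself is preserved.
  static void UnattenuatePixel(const unsigned char src[4], unsigned char dst[4]) {
    unsigned int a = src[3];
    for (int c = 0; c < 3; ++c) {
      unsigned int v = a ? (src[c] * 255u) / a : src[c];
      dst[c] = v > 255u ? 255u : v;
    }
    dst[3] = (unsigned char)a;
  }
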
@@ -5029,9 +3938,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
-static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
@@ -5044,9 +3952,8 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
- vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
+ vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
- align 4
convertloop:
vmovdqu ymm6, [eax] // read 8 pixels.
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
@@ -5061,9 +3968,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
vpackuswb ymm0, ymm0, ymm1 // unmutated.
- sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
+ sub ecx, 8
jg convertloop
vzeroupper
@@ -5080,12 +3987,11 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
- vmovdqa ymm5, kUnattenShuffleAlpha_AVX2
+ vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
push esi
push edi
- align 4
convertloop:
// replace VPGATHER
movzx esi, byte ptr [eax + 3] // alpha0
@@ -5123,9 +4029,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
vpackuswb ymm0, ymm0, ymm1 // unmutated.
- sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
+ sub ecx, 8
jg convertloop
pop edi
@@ -5148,18 +4054,17 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
movdqa xmm4, kARGBToYJ
movdqa xmm5, kAddYJ64
- align 4
convertloop:
- movdqa xmm0, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm0, xmm1
paddw xmm0, xmm5 // Add .5 for rounding.
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 G bytes
- movdqa xmm2, [eax] // A
- movdqa xmm3, [eax + 16]
+ movdqu xmm2, [eax] // A
+ movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
psrld xmm2, 24
psrld xmm3, 24
@@ -5171,10 +4076,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
movdqa xmm1, xmm0
punpcklwd xmm0, xmm3 // GGGA first 4
punpckhwd xmm1, xmm3 // GGGA next 4
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
+ sub ecx, 8
jg convertloop
ret
}
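
ARGBGrayRow computes a JPEG-range luma with pmaddubsw and replicates it into B, G and R while keeping alpha. Assuming the usual kARGBToYJ weights (15, 75, 38 for B, G, R, summing to 128) and BGRA byte order, the per-pixel math is:

  // Gray = (B*15 + G*75 + R*38 + 64) >> 7, then the pixel becomes (G, G, G, A).
  static unsigned char GrayFromARGB(const unsigned char bgra[4]) {
    return (unsigned char)((bgra[0] * 15 + bgra[1] * 75 + bgra[2] * 38 + 64) >> 7);
  }
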
@@ -5208,32 +4113,31 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
movdqa xmm3, kARGBToSepiaG
movdqa xmm4, kARGBToSepiaR
- align 4
convertloop:
- movdqa xmm0, [eax] // B
- movdqa xmm6, [eax + 16]
+ movdqu xmm0, [eax] // B
+ movdqu xmm6, [eax + 16]
pmaddubsw xmm0, xmm2
pmaddubsw xmm6, xmm2
phaddw xmm0, xmm6
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 B values
- movdqa xmm5, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm5, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm3
pmaddubsw xmm1, xmm3
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 G values
punpcklbw xmm0, xmm5 // 8 BG values
- movdqa xmm5, [eax] // R
- movdqa xmm1, [eax + 16]
+ movdqu xmm5, [eax] // R
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 R values
- movdqa xmm6, [eax] // A
- movdqa xmm1, [eax + 16]
+ movdqu xmm6, [eax] // A
+ movdqu xmm1, [eax + 16]
psrld xmm6, 24
psrld xmm1, 24
packuswb xmm6, xmm1
@@ -5242,10 +4146,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
movdqa xmm1, xmm0 // Weave BG, RA together
punpcklwd xmm0, xmm5 // BGRA first 4
punpckhwd xmm1, xmm5 // BGRA next 4
- sub ecx, 8
- movdqa [eax], xmm0
- movdqa [eax + 16], xmm1
+ movdqu [eax], xmm0
+ movdqu [eax + 16], xmm1
lea eax, [eax + 32]
+ sub ecx, 8
jg convertloop
ret
}
@@ -5271,14 +4175,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
pshufd xmm5, xmm5, 0xff
mov ecx, [esp + 16] /* width */
- align 4
convertloop:
- movdqa xmm0, [eax] // B
- movdqa xmm7, [eax + 16]
+ movdqu xmm0, [eax] // B
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm0, xmm2
pmaddubsw xmm7, xmm2
- movdqa xmm6, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm6, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
phaddsw xmm0, xmm7 // B
@@ -5288,13 +4191,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
packuswb xmm0, xmm0 // 8 B values
packuswb xmm6, xmm6 // 8 G values
punpcklbw xmm0, xmm6 // 8 BG values
- movdqa xmm1, [eax] // R
- movdqa xmm7, [eax + 16]
+ movdqu xmm1, [eax] // R
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
pmaddubsw xmm7, xmm4
phaddsw xmm1, xmm7 // R
- movdqa xmm6, [eax] // A
- movdqa xmm7, [eax + 16]
+ movdqu xmm6, [eax] // A
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm6, xmm5
pmaddubsw xmm7, xmm5
phaddsw xmm6, xmm7 // A
@@ -5306,11 +4209,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
movdqa xmm6, xmm0 // Weave BG, RA together
punpcklwd xmm0, xmm1 // BGRA first 4
punpckhwd xmm6, xmm1 // BGRA next 4
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm6
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm6
lea eax, [eax + 32]
lea edx, [edx + 32]
+ sub ecx, 8
jg convertloop
ret
}
@@ -5319,7 +4222,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
@@ -5339,25 +4241,24 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
pcmpeqb xmm6, xmm6 // generate mask 0xff000000
pslld xmm6, 24
- align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm5 // first 2 pixels
pmulhuw xmm0, xmm2 // pixel * scale >> 16
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm5 // next 2 pixels
pmulhuw xmm1, xmm2
pmullw xmm0, xmm3 // * interval_size
- movdqa xmm7, [eax] // read 4 pixels
+ movdqu xmm7, [eax] // read 4 pixels
pmullw xmm1, xmm3
pand xmm7, xmm6 // mask alpha
paddw xmm0, xmm4 // + interval_size / 2
paddw xmm1, xmm4
packuswb xmm0, xmm1
por xmm0, xmm7
- sub ecx, 4
- movdqa [eax], xmm0
+ movdqu [eax], xmm0
lea eax, [eax + 16]
+ sub ecx, 4
jg convertloop
ret
}
@@ -5366,7 +4267,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
@@ -5378,9 +4278,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2
- align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 // first 2
@@ -5390,9 +4289,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
ret
@@ -5413,7 +4312,6 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
- align 4
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
movdqu xmm2, [esi] // read 4 pixels from src_argb1
@@ -5428,9 +4326,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
pop esi
@@ -5455,16 +4353,15 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
sub ecx, 4
jl convertloop49
- align 4
convertloop4:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jge convertloop4
convertloop49:
@@ -5477,9 +4374,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
movd xmm1, [esi] // read 1 pixel from src_argb1

lea esi, [esi + 4]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge convertloop1
convertloop19:
@@ -5501,16 +4398,15 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- align 4
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
psubusb xmm0, xmm1 // src_argb0 - src_argb1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
pop esi
@@ -5532,7 +4428,6 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0
- align 4
convertloop:
vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32]
@@ -5569,7 +4464,6 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- align 4
convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32]
@@ -5599,7 +4493,6 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- align 4
convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32]
@@ -5638,7 +4531,6 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
sub edx, eax
pxor xmm5, xmm5 // constant 0
- align 4
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
@@ -5662,9 +4554,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
psubw xmm1, xmm0
pmaxsw xmm0, xmm1
packuswb xmm0, xmm0
- sub ecx, 8
movq qword ptr [eax + edx], xmm0
lea eax, [eax + 8]
+ sub ecx, 8
jg convertloop
pop edi
@@ -5692,7 +4584,6 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
sub edx, eax
pxor xmm5, xmm5 // constant 0
- align 4
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
@@ -5716,9 +4607,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
psubw xmm1, xmm0
pmaxsw xmm0, xmm1
packuswb xmm0, xmm0
- sub ecx, 8
movq qword ptr [eax + edx], xmm0
lea eax, [eax + 8]
+ sub ecx, 8
jg convertloop
pop esi
@@ -5746,10 +4637,9 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
pcmpeqb xmm5, xmm5 // alpha 255
pslld xmm5, 24 // 0xff000000
- align 4
convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
movdqa xmm2, xmm0 // GG
@@ -5765,12 +4655,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
punpckhwd xmm0, xmm0 // Last 4
por xmm3, xmm5 // GGGA
por xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm1
- movdqa [edx + 16], xmm2
- movdqa [edx + 32], xmm3
- movdqa [edx + 48], xmm0
+ movdqu [edx], xmm1
+ movdqu [edx + 16], xmm2
+ movdqu [edx + 32], xmm3
+ movdqu [edx + 48], xmm0
lea edx, [edx + 64]
+ sub ecx, 16
jg convertloop
pop esi
@@ -5792,15 +4682,14 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
mov ecx, [esp + 4 + 16] // width
sub esi, eax
- align 4
convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
pop esi
@@ -5827,10 +4716,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
sub esi, eax
pcmpeqb xmm5, xmm5 // alpha 255
- align 4
convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
movdqa xmm2, xmm0
paddusb xmm2, xmm1 // sobel = sobelx + sobely
@@ -5846,12 +4734,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
movdqa xmm7, xmm1 // YSXA
punpcklwd xmm7, xmm0 // Next 4
punpckhwd xmm1, xmm0 // Last 4
- sub ecx, 16
- movdqa [edx], xmm6
- movdqa [edx + 16], xmm4
- movdqa [edx + 32], xmm7
- movdqa [edx + 48], xmm1
+ movdqu [edx], xmm6
+ movdqu [edx + 16], xmm4
+ movdqu [edx + 32], xmm7
+ movdqu [edx + 48], xmm1
lea edx, [edx + 64]
+ sub ecx, 16
jg convertloop
pop esi
@@ -5872,8 +4760,7 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
-// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
-// aligned.
+// Does 4 pixels at a time.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
int width, int area, uint8* dst,
int count) {
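
The loops below evaluate a box filter from a summed-area table: the sum over any rectangle is TL - TR - BL + BR, taken per channel. A scalar model, assuming width is the box width in pixels with four int32 per pixel (matching the asm's [eax + edx * 4] addressing):

  static void CumulativeSumToAverage_C(const int* topleft, const int* botleft,
                                       int width, int area, unsigned char* dst,
                                       int count) {
    for (int i = 0; i < count; ++i) {
      for (int c = 0; c < 4; ++c) {
        int sum = topleft[c] - topleft[width * 4 + c]   // TL - TR
                - botleft[c] + botleft[width * 4 + c];  // - BL + BR
        dst[c] = (unsigned char)(sum / area);
      }
      topleft += 4;
      botleft += 4;
      dst += 4;
    }
  }
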
@@ -5903,13 +4790,12 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
packssdw xmm5, xmm5 // 16 bit shorts
// 4 pixel loop small blocks.
- align 4
s4:
// top left
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
@@ -5946,13 +4832,12 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
jmp l4b
// 4 pixel loop
- align 4
l4:
// top left
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
@@ -5999,9 +4884,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
jl l1b
// 1 pixel loop
- align 4
l1:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
lea eax, [eax + 16]
psubd xmm0, [esi]
@@ -6040,7 +4924,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
jne l4b
// 4 pixel loop
- align 4
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@@ -6057,26 +4940,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
punpckhwd xmm5, xmm1
paddd xmm0, xmm2
- movdqa xmm2, [esi] // previous row above.
+ movdqu xmm2, [esi] // previous row above.
paddd xmm2, xmm0
paddd xmm0, xmm3
- movdqa xmm3, [esi + 16]
+ movdqu xmm3, [esi + 16]
paddd xmm3, xmm0
paddd xmm0, xmm4
- movdqa xmm4, [esi + 32]
+ movdqu xmm4, [esi + 32]
paddd xmm4, xmm0
paddd xmm0, xmm5
- movdqa xmm5, [esi + 48]
+ movdqu xmm5, [esi + 48]
lea esi, [esi + 64]
paddd xmm5, xmm0
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
- movdqa [edx + 32], xmm4
- movdqa [edx + 48], xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ movdqu [edx + 32], xmm4
+ movdqu [edx + 48], xmm5
lea edx, [edx + 64]
sub ecx, 4
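
ComputeCumulativeSumRow builds the table the averaging loop consumes: each output is a running left-to-right sum of this row's bytes plus the finished row above. A scalar version, close to libyuv's own C fallback:

  static void ComputeCumulativeSumRow_C(const unsigned char* row, int* cumsum,
                                        const int* previous_cumsum, int width) {
    int sum[4] = {0, 0, 0, 0};
    for (int x = 0; x < width; ++x) {
      for (int c = 0; c < 4; ++c) {
        sum[c] += row[x * 4 + c];
        cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
      }
    }
  }
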
@@ -6087,7 +4970,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
jl l1b
// 1 pixel loop
- align 4
l1:
movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
lea eax, [eax + 4]
@@ -6142,7 +5024,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
addps xmm4, xmm4 // dudv *= 4
// 4 pixel loop
- align 4
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
@@ -6164,9 +5045,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
movd xmm0, [eax + edi] // read pixel 3
punpckldq xmm6, xmm0 // combine pixel 2 and 3
addps xmm3, xmm4 // x, y += dx, dy next 2
- sub ecx, 4
movq qword ptr 8[edx], xmm6
lea edx, [edx + 16]
+ sub ecx, 4
jge l4
l4b:
@@ -6174,7 +5055,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
jl l1b
// 1 pixel loop
- align 4
l1:
cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts
@@ -6182,9 +5062,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
addps xmm2, xmm7 // x, y += dx, dy
movd esi, xmm0
movd xmm0, [eax + esi] // copy a pixel
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge l1
l1b:
pop edi
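
ARGBAffineRow walks a (u, v) source coordinate by a constant (du, dv) per output pixel and point-samples the source, which is what the cvttps2dq/addps pairs above do four at a time. A scalar sketch using the same uv_dudv = {u, v, du, dv} convention:

  static void ARGBAffineRow_C(const unsigned char* src_argb, int src_stride,
                              unsigned char* dst_argb, const float* uv_dudv,
                              int width) {
    float u = uv_dudv[0], v = uv_dudv[1];
    for (int i = 0; i < width; ++i) {
      int x = (int)u, y = (int)v;  // truncate like cvttps2dq
      const unsigned char* p = src_argb + y * src_stride + x * 4;
      for (int c = 0; c < 4; ++c) dst_argb[i * 4 + c] = p[c];
      u += uv_dudv[2];
      v += uv_dudv[3];
    }
  }
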
@@ -6195,11 +5075,11 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
#endif // HAS_ARGBAFFINEROW_SSE2
#ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 16x2 -> 16x1
+// Bilinear filter 32x2 -> 32x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
__asm {
push esi
push edi
@@ -6229,7 +5109,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vpxor ymm0, ymm0, ymm0
vpermd ymm5, ymm0, ymm5
- align 4
xloop:
vmovdqu ymm0, [esi]
vmovdqu ymm2, [esi + edx]
@@ -6240,51 +5119,49 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vpsrlw ymm0, ymm0, 7
vpsrlw ymm1, ymm1, 7
vpackuswb ymm0, ymm0, ymm1 // unmutates
- sub ecx, 32
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- vmovdqu ymm0, [esi]
- vpavgb ymm0, ymm0, [esi + edx]
- vpavgb ymm0, ymm0, [esi + edx]
- sub ecx, 32
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- vmovdqu ymm0, [esi]
- vpavgb ymm0, ymm0, [esi + edx]
- sub ecx, 32
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- vmovdqu ymm0, [esi + edx]
- vpavgb ymm0, ymm0, [esi]
- vpavgb ymm0, ymm0, [esi]
sub ecx, 32
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- jg xloop75
+ jg xloop
jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- rep movsb
+ // Blend 25 / 75.
+ xloop25:
+ vmovdqu ymm0, [esi]
+ vmovdqu ymm1, [esi + edx]
+ vpavgb ymm0, ymm0, ymm1
+ vpavgb ymm0, ymm0, ymm1
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ vmovdqu ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi + edx]
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ xloop75:
+ vmovdqu ymm1, [esi]
+ vmovdqu ymm0, [esi + edx]
+ vpavgb ymm0, ymm0, ymm1
+ vpavgb ymm0, ymm0, ymm1
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ rep movsb
xloop99:
pop edi
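
All the row interpolators in this file, AVX2 included, blend two rows with an 8-bit fraction and special-case 0, 25, 50, 75 and 100 percent with pavgb or a plain copy; the patch keeps only the unaligned loads and, as elsewhere, moves `sub ecx` after the store. The general case reduces to this scalar model:

  #include <stddef.h>

  // dst = (row0 * (256 - f) + row1 * f) >> 8, f = source_y_fraction in 0..255.
  static void InterpolateRow_Model(unsigned char* dst, const unsigned char* src,
                                   ptrdiff_t src_stride, int width, int f) {
    const unsigned char* row1 = src + src_stride;
    for (int x = 0; x < width; ++x)
      dst[x] = (unsigned char)((src[x] * (256 - f) + row1[x] * f) >> 8);
  }
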
@@ -6295,7 +5172,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_AVX2
-#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
@@ -6329,226 +5205,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
- align 4
- xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm5
- psrlw xmm0, 7
- psrlw xmm1, 7
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- movdqa xmm1, [esi]
- movdqa xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_SSSE3
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 256. Blend 100 / 0.
- cmp eax, 64
- je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
- cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
- cmp eax, 192
- je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
-
- movd xmm5, eax // xmm5 = y fraction
- punpcklbw xmm5, xmm5
- psrlw xmm5, 1
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- punpcklqdq xmm5, xmm5
- pxor xmm4, xmm4
-
- align 4
- xloop:
- movdqa xmm0, [esi] // row0
- movdqa xmm2, [esi + edx] // row1
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- psubw xmm2, xmm0 // row1 - row0
- psubw xmm3, xmm1
- paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
- paddw xmm3, xmm3
- pmulhw xmm2, xmm5 // scale diff
- pmulhw xmm3, xmm5
- paddw xmm0, xmm2 // sum rows
- paddw xmm1, xmm3
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- movdqa xmm1, [esi]
- movdqa xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_SSE2
-
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- shr eax, 1
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 128. Blend 100 / 0.
- cmp eax, 32
- je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
- cmp eax, 64
- je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
- cmp eax, 96
- je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
-
- movd xmm0, eax // high fraction 0..127
- neg eax
- add eax, 128
- movd xmm5, eax // low fraction 128..1
- punpcklbw xmm5, xmm0
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
-
- align 4
xloop:
movdqu xmm0, [esi]
movdqu xmm2, [esi + edx]
@@ -6560,57 +5216,53 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
psrlw xmm0, 7
psrlw xmm1, 7
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop
jmp xloop99
// Blend 25 / 75.
- align 4
xloop25:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop25
jmp xloop99
// Blend 50 / 50.
- align 4
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop50
jmp xloop99
// Blend 75 / 25.
- align 4
xloop75:
movdqu xmm1, [esi]
movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop75
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
- align 4
xloop100:
movdqu xmm0, [esi]
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop100
xloop99:
@@ -6623,9 +5275,9 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
__asm {
push esi
push edi
@@ -6653,7 +5305,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
punpcklqdq xmm5, xmm5
pxor xmm4, xmm4
- align 4
xloop:
movdqu xmm0, [esi] // row0
movdqu xmm2, [esi + edx] // row1
@@ -6672,57 +5323,53 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
paddw xmm0, xmm2 // sum rows
paddw xmm1, xmm3
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop
jmp xloop99
// Blend 25 / 75.
- align 4
xloop25:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop25
jmp xloop99
// Blend 50 / 50.
- align 4
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop50
jmp xloop99
// Blend 75 / 25.
- align 4
xloop75:
movdqu xmm1, [esi]
movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop75
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
- align 4
xloop100:
movdqu xmm0, [esi]
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop100
xloop99:
@@ -6733,84 +5380,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_SSE2
-__declspec(naked) __declspec(align(16))
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // src_uv_stride
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- sub edi, eax
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- pavgb xmm0, [eax + edx]
- sub ecx, 16
- movdqa [eax + edi], xmm0
- lea eax, [eax + 16]
- jg convertloop
- pop edi
- ret
- }
-}
-
-#ifdef HAS_HALFROW_AVX2
-__declspec(naked) __declspec(align(16))
-void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // src_uv_stride
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- sub edi, eax
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vpavgb ymm0, ymm0, [eax + edx]
- sub ecx, 32
- vmovdqu [eax + edi], ymm0
- lea eax, [eax + 32]
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-#endif // HAS_HALFROW_AVX2
-
-__declspec(naked) __declspec(align(16))
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_bayer
- movd xmm5, [esp + 12] // selector
- mov ecx, [esp + 16] // pix
- pshufd xmm5, xmm5, 0
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- punpckldq xmm0, xmm1
- sub ecx, 8
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg wloop
- ret
- }
-}
-
// Specialized ARGB to Bayer that just isolates G channel.
__declspec(naked) __declspec(align(16))
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
@@ -6823,10 +5392,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
psrld xmm5, 24
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrld xmm0, 8 // Move green to bottom.
psrld xmm1, 8
@@ -6834,9 +5402,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
pand xmm1, xmm5
packssdw xmm0, xmm1
packuswb xmm0, xmm1
- sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
+ sub ecx, 8
jg wloop
ret
}
@@ -6850,46 +5418,19 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
- movdqa xmm5, [ecx]
- mov ecx, [esp + 16] // pix
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- jg wloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- movdqa xmm5, [ecx]
+ movdqu xmm5, [ecx]
mov ecx, [esp + 16] // pix
- align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pshufb xmm0, xmm5
pshufb xmm1, xmm5
- sub ecx, 8
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
+ sub ecx, 8
jg wloop
ret
}
@@ -6906,17 +5447,16 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
mov ecx, [esp + 16] // pix
- align 4
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpshufb ymm0, ymm0, ymm5
vpshufb ymm1, ymm1, ymm5
- sub ecx, 16
vmovdqu [edx], ymm0
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
+ sub ecx, 16
jg wloop
vzeroupper
@@ -6967,7 +5507,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
jg shuf_any1
jmp shuf99
- align 4
shuf_0123:
movdqu xmm0, [eax]
lea eax, [eax + 16]
@@ -6979,13 +5518,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 01Bh
pshuflw xmm1, xmm1, 01Bh
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg shuf_0123
jmp shuf99
- align 4
shuf_0321:
movdqu xmm0, [eax]
lea eax, [eax + 16]
@@ -6997,13 +5535,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 039h
pshuflw xmm1, xmm1, 039h
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg shuf_0321
jmp shuf99
- align 4
shuf_2103:
movdqu xmm0, [eax]
lea eax, [eax + 16]
@@ -7015,13 +5552,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 093h
pshuflw xmm1, xmm1, 093h
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg shuf_2103
jmp shuf99
- align 4
shuf_3012:
movdqu xmm0, [eax]
lea eax, [eax + 16]
@@ -7033,9 +5569,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 0C6h
pshuflw xmm1, xmm1, 0C6h
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg shuf_3012
shuf99:
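
The four fixed shufflers above cover the common channel permutations without SSSE3: after punpcklbw widens bytes to words, each 2-bit field of the pshuflw/pshufhw immediate selects one source word, so 0x1B reverses the four channels and 0x39, 0x93 and 0xC6 rotate or swap them. A scalar way to read those immediates (illustrative, not libyuv API):

  // dst word c comes from src word (imm >> (2 * c)) & 3; e.g. imm 0x1B
  // yields the map {3, 2, 1, 0}, a full channel reversal.
  static void ShufflePixel(const unsigned char src[4], unsigned int imm,
                           unsigned char dst[4]) {
    for (int c = 0; c < 4; ++c)
      dst[c] = src[(imm >> (2 * c)) & 3];
  }
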
@@ -7066,7 +5602,6 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
mov ecx, [esp + 8 + 20] // width
sub edx, esi
- align 4
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
@@ -7104,7 +5639,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
mov ecx, [esp + 8 + 20] // width
sub edx, esi
- align 4
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
@@ -7141,7 +5675,6 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
// 2 pixel loop.
- align 4
convertloop:
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
@@ -7177,9 +5710,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
cvttps2dq xmm4, xmm4
packuswb xmm0, xmm4
packuswb xmm0, xmm0
- sub ecx, 2
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
+ sub ecx, 2
jg convertloop
pop esi
ret
@@ -7203,7 +5736,6 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
mov ecx, [esp + 16] /* width */
// 2 pixel loop.
- align 4
convertloop:
vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
lea eax, [eax + 8]
@@ -7217,9 +5749,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
- sub ecx, 2
vmovq qword ptr [edx], xmm0
lea edx, [edx + 8]
+ sub ecx, 2
jg convertloop
vzeroupper
ret
@@ -7239,7 +5771,6 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
- align 4
convertloop:
movzx edx, byte ptr [eax]
lea eax, [eax + 4]
@@ -7273,7 +5804,6 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
- align 4
convertloop:
movzx edx, byte ptr [eax]
lea eax, [eax + 4]
@@ -7315,7 +5845,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
pxor xmm5, xmm5
// 4 pixel loop.
- align 4
convertloop:
movdqu xmm0, qword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3
@@ -7382,9 +5911,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
movzx edx, byte ptr [eax + 15] // copy alpha.
mov byte ptr [edi + 15], dl
- sub ecx, 4
lea eax, [eax + 16]
lea edi, [edi + 16]
+ sub ecx, 4
jg convertloop
pop edi
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale.cc
index 5b33b5f048d..482c5a61e35 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale.cc
@@ -57,20 +57,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
}
-#elif defined(HAS_SCALEROWDOWN2_SSE2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
- ScaleRowDown2Box_Unaligned_SSE2);
- if (IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
- ScaleRowDown2Box_SSE2);
- }
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+ ScaleRowDown2Box_SSE2);
}
-#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -112,21 +107,15 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
ScaleRowDown2_16_NEON;
}
-#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ?
- ScaleRowDown2_Unaligned_16_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 :
- ScaleRowDown2Box_Unaligned_16_SSE2);
- if (IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
- ScaleRowDown2Box_16_SSE2);
- }
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+ ScaleRowDown2Box_16_SSE2);
}
-#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -168,13 +157,13 @@ static void ScalePlaneDown4(int src_width, int src_height,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
}
-#elif defined(HAS_SCALEROWDOWN4_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
}
-#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -212,14 +201,14 @@ static void ScalePlaneDown4_16(int src_width, int src_height,
ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
ScaleRowDown4_16_NEON;
}
-#elif defined(HAS_SCALEROWDOWN4_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
ScaleRowDown4_16_SSE2;
}
-#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -271,8 +260,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
}
#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
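
From here on, the scale.cc hunks repeat one pattern: each SIMD path gets its own #if/#endif block instead of an #elif chain, so every compiled-in backend is considered in turn, and the src/dst pointer and stride alignment tests disappear because the row functions above are now unaligned-safe. A sketch of the resulting dispatch shape, with hypothetical row names (TestCpuFlag and IS_ALIGNED are the real libyuv helpers):

  #include <stddef.h>
  #include <stdint.h>

  typedef void (*ScaleRowFn)(const uint8_t* src, ptrdiff_t stride,
                             uint8_t* dst, int dst_width);
  void ScaleRow_C(const uint8_t*, ptrdiff_t, uint8_t*, int);     // fallback
  void ScaleRow_NEON(const uint8_t*, ptrdiff_t, uint8_t*, int);  // hypothetical
  void ScaleRow_SSE2(const uint8_t*, ptrdiff_t, uint8_t*, int);  // hypothetical

  static ScaleRowFn PickScaleRow(int dst_width) {
    ScaleRowFn fn = ScaleRow_C;  // portable C first, SIMD only if it applies
  #if defined(HAS_SCALEROW_NEON)
    if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) fn = ScaleRow_NEON;
  #endif
  #if defined(HAS_SCALEROW_SSE2)
    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) fn = ScaleRow_SSE2;
  #endif
    return fn;
  }
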
@@ -351,8 +339,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
}
#endif
#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
@@ -445,9 +432,9 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
}
}
-#elif defined(HAS_SCALEROWDOWN38_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
@@ -456,7 +443,8 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
}
}
-#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -522,9 +510,9 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
}
}
-#elif defined(HAS_SCALEROWDOWN38_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
@@ -533,7 +521,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
}
}
-#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -758,11 +747,11 @@ static void ScalePlaneBox(int src_width, int src_height,
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
#if defined(HAS_SCALEADDROWS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
+ if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
- IS_ALIGNED(src_width, 16) &&
+ && IS_ALIGNED(src_width, 16)
#endif
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ ) {
ScaleAddRows = ScaleAddRows_SSE2;
}
#endif
@@ -830,11 +819,11 @@ static void ScalePlaneBox_16(int src_width, int src_height,
uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
#if defined(HAS_SCALEADDROWS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
+ if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
- IS_ALIGNED(src_width, 16) &&
+ && IS_ALIGNED(src_width, 16)
#endif
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ ) {
ScaleAddRows = ScaleAddRows_16_SSE2;
}
#endif
@@ -886,29 +875,23 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
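
The simplification above also drops the src_width >= 16 pre-check and the _Unaligned_ middle tier: the Any_ variant now covers every width, and the exact variant is chosen only when the width is a multiple of the vector step. A hypothetical sketch of what an Any_ wrapper does (the real ones are macro-generated, and libyuv spells the byte type uint8):

  #include <stddef.h>
  #include <stdint.h>

  void InterpolateRow_SSE2(uint8_t*, const uint8_t*, ptrdiff_t, int, int);
  void InterpolateRow_C(uint8_t*, const uint8_t*, ptrdiff_t, int, int);

  static void InterpolateRow_Any(uint8_t* dst, const uint8_t* src,
                                 ptrdiff_t stride, int width, int frac) {
    int n = width & ~15;  // widest multiple of the 16-pixel SIMD step
    if (n) InterpolateRow_SSE2(dst, src, stride, n, frac);
    InterpolateRow_C(dst + n, src + n, stride, width - n, frac);  // C tail
  }
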
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(src_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -916,7 +899,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
@@ -924,7 +907,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(src_width, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
@@ -988,29 +971,23 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
+ InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
+ InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(src_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
@@ -1018,7 +995,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
@@ -1026,7 +1003,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(src_width, 4)) {
InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
@@ -1087,29 +1064,23 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -1117,7 +1088,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
@@ -1125,7 +1096,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
@@ -1144,9 +1115,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleColsUp2_SSE2;
}
#endif
@@ -1226,29 +1195,23 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
+ InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
+ InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(dst_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
@@ -1256,7 +1219,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
@@ -1264,7 +1227,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
@@ -1283,9 +1246,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_16_C;
#if defined(HAS_SCALECOLS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleColsUp2_16_SSE2;
}
#endif
@@ -1366,9 +1327,7 @@ static void ScalePlaneSimple(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleCols = ScaleColsUp2_SSE2;
}
#endif
@@ -1401,9 +1360,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleCols = ScaleColsUp2_16_C;
#if defined(HAS_SCALECOLS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleCols = ScaleColsUp2_16_SSE2;
}
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_argb.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_argb.cc
index e339cd7c791..05b58e1baba 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_argb.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_argb.cc
@@ -53,16 +53,14 @@ static void ScaleARGBDown2(int src_width, int src_height,
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
ScaleARGBRowDown2_NEON;
}
@@ -98,14 +96,12 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
#endif
@@ -139,14 +135,13 @@ static void ScaleARGBDownEven(int src_width, int src_height,
assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
@@ -190,29 +185,23 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
src_argb += xl * 4;
x -= (int)(xl << 16);
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(clip_src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(clip_src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(clip_src_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -220,15 +209,15 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
@@ -286,29 +275,23 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -316,15 +299,15 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
@@ -346,9 +329,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
@@ -427,18 +408,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(src_width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@@ -446,7 +424,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
@@ -467,29 +445,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -497,15 +469,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
@@ -531,9 +503,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
@@ -640,9 +610,7 @@ static void ScaleARGBSimple(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_common.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_common.cc
index e4b2acc41b1..96e2564b00d 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_common.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_common.cc
@@ -885,31 +885,23 @@ void ScalePlaneVertical(int src_height,
assert(dst_height > 0);
src_argb += (x >> 16) * bpp;
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width_bytes, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -917,15 +909,15 @@ void ScalePlaneVertical(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
@@ -967,31 +959,23 @@ void ScalePlaneVertical_16(int src_height,
assert(dst_height > 0);
src_argb += (x >> 16) * wpp;
#if defined(HAS_INTERPOLATEROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
+ InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
+ InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(dst_width_bytes, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
@@ -999,15 +983,15 @@ void ScalePlaneVertical_16(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon.cc
index 1b8a5ba58f2..7921219b5fa 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon.cc
@@ -16,7 +16,8 @@ extern "C" {
#endif
// This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
// NEON downscalers with interpolation.
// Provided by Fritz Koenig
@@ -756,7 +757,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
-#endif // __ARM_NEON__
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
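The guard change here pairs with scale_neon64.cc below so that exactly one NEON translation unit compiles per target; previously a toolchain defining __ARM_NEON__ for an AArch64 target could pull this file's 32-bit asm into a 64-bit build. Condensed, the two guards are now mutually exclusive:

/* scale_neon.cc: 32-bit ARMv7 kernels only. */
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
/* ... vld1/vst1-style kernels ... */
#endif
/* scale_neon64.cc: 64-bit ARMv8 kernels only. */
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
/* ... ld1/st1-style kernels ... */
#endif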
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon64.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon64.cc
index 64c7d10dbe1..fb68b67d29c 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon64.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon64.cc
@@ -8,133 +8,122 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "libyuv/scale.h"
#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// This module is for GCC Neon.
+// This module is for GCC Neon armv8 64-bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-#ifdef HAS_SCALEROWDOWN2_NEON
+
// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
- // load even pixels into q0, odd into q1
+ // load even pixels into v0, odd into v1
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #16 \n" // 16 processed per loop
MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
- : "q0", "q1" // Clobber List
+ : "v0", "v1" // Clobber List
);
}
-#endif //HAS_SCALEROWDOWN2_NEON
-#ifdef HAS_SCALEROWDOWN2_NEON
// Read 32x2, average down, and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
- "add %1, %0 \n"
- ".p2align 2 \n"
+ "add %1, %1, %0 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1)
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #2 \n"
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "uaddlp v1.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
- : "q0", "q1", "q2", "q3" // Clobber List
+ : "v0", "v1", "v2", "v3" // Clobber List
);
}
-#endif //HAS_SCALEROWDOWN2_NEON
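For reference, a C model of the 2x2 box filter this kernel vectorizes, mirroring libyuv's ScaleRowDown2Box_C (sum the four neighbors, add 2 to round, shift right by 2):

#include <stddef.h>  /* ptrdiff_t */

void ScaleRowDown2Box_C_ref(const unsigned char* s, ptrdiff_t src_stride,
                            unsigned char* dst, int dst_width) {
  const unsigned char* t = s + src_stride;  /* second source row */
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (unsigned char)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}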
-#ifdef HAS_SCALEROWDOWN4_NEON
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
- : "q0", "q1", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN4_NEON
-#ifdef HAS_SCALEROWDOWN4_NEON
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride;
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3)
- "vld1.8 {q1}, [%3]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
MEMACCESS(4)
- "vld1.8 {q2}, [%4]! \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5)
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %5, %5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "uadalp v0.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n"
+ "uadalp v0.8h, v3.16b \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
MEMACCESS(1)
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_ptr1), // %3
- "+r"(src_ptr2), // %4
- "+r"(src_ptr3) // %5
+ "+r"(src_ptr1), // %2
+ "+r"(src_ptr2), // %3
+ "+r"(src_ptr3), // %4
+ "+r"(dst_width) // %5
:
- : "q0", "q1", "q2", "q3", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN4_NEON
-#ifdef HAS_SCALEROWDOWN34_NEON
// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
@@ -142,136 +131,129 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- ".p2align 2 \n"
- "1: \n"
+ "1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
- : "d0", "d1", "d2", "d3", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN34_NEON
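The ld4/orr/st3 sequence above deinterleaves every group of four pixels into v0..v3, copies v3 over v2, and stores three of the four lanes, so pixels 0, 1 and 3 of each group survive. A C model of that point-sampling pattern (it matches libyuv's ScaleRowDown34_C; dst_width is assumed to be a multiple of 3):

void ScaleRowDown34_C_ref(const unsigned char* src, unsigned char* dst,
                          int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[3];  /* pixel 2 of each group of 4 is dropped */
    dst += 3;
    src += 4;
  }
}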
-#ifdef HAS_SCALEROWDOWN34_NEON
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %2, %2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
// 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
// (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
- : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
+ "v20", "memory", "cc"
);
}
-#endif //ScaleRowDown34_0_Box_NEON
-#ifdef HAS_SCALEROWDOWN34_NEON
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %2, %2, #24 \n"
// average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
- : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN34_NEON
-#ifdef HAS_SCALEROWDOWN38_NEON
-#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
- { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+ { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
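The divisor tables hold 65536/12 for a divide by 6 (and, in the kMult38_Div9 table referenced below, presumably 65536/18 for a divide by 9) because vqrdmulh/sqrdmulh is a doubling, rounding, high-half multiply. A C model of the arithmetic, ignoring the instruction's saturation:

static short sqrdmulh16(short a, short b) {
  return (short)((2 * (int)a * (int)b + (1 << 15)) >> 16);  /* sqrdmulh */
}
/* For a sum s of six pixels:
 *   sqrdmulh16(s, 65536 / 12) = (s * 10922 + 32768) >> 16 ~= s / 6,
 * since 2 * (65536 / 12) = 10922 and 10922 / 65536 ~= 1 / 6. */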
@@ -285,504 +267,498 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
uint8* dst_ptr, int dst_width) {
asm volatile (
MEMACCESS(3)
- "vld1.8 {q3}, [%3] \n"
- ".p2align 2 \n"
- "1: \n"
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
MEMACCESS(0)
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %2, %2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1)
- "vst1.8 {d4}, [%1]! \n"
+ "st1 {v2.8b}, [%1], #8 \n"
MEMACCESS(1)
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
- : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "memory", "cc"
);
}
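kShuf38 is unchanged because both the ARM32 {d0-d3} table and the AArch64 {v0,v1} table span 32 bytes with flat indexing; kShuf38_2 changes (8 becomes 16, 16 becomes 32, and so on) because vtbl indexes 8-byte d registers while tbl indexes 16-byte v registers, so byte 0 of the second and third table registers moves from index 8 and 16 to 16 and 32. A C model of the lookup across a three-register table:

/* tbl across {v0, v1, v2}: a flat 48-byte table; out-of-range indices
 * produce 0, matching the instruction. */
void tbl48(const unsigned char table[48], const unsigned char idx[16],
           unsigned char out[16]) {
  int i;
  for (i = 0; i < 16; ++i)
    out[i] = (idx[i] < 48) ? table[idx[i]] : 0;
}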
-#endif //HAS_SCALEROWDOWN38_NEON
-
-#ifdef HAS_SCALEROWDOWN38_NEON
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
+ ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(5)
- "vld1.16 {q13}, [%5] \n"
+ "ld1 {v29.8h}, [%5] \n"
MEMACCESS(6)
- "vld1.8 {q14}, [%6] \n"
+ "ld1 {v30.16b}, [%6] \n"
MEMACCESS(7)
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4)
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %4, %4, #12 \n"
    // Shuffle the input data around to align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
- "vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "add v0.8h, v0.8h, v16.8h \n"
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
    // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q15 \n"
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
    // Align for table lookup; vtbl requires registers to
    //  be adjacent.
- "vmov.u8 d2, d4 \n"
-
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
MEMACCESS(1)
- "vst1.8 {d3}, [%1]! \n"
+ "st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride), // %3
- "+r"(src_ptr1) // %4
+ "+r"(tmp_src_stride), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(dst_width) // %4
: "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7
- : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
+ "v30", "v31", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN38_NEON
-#ifdef HAS_SCALEROWDOWN38_NEON
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
+ // TODO(fbarchard): use src_stride directly for clang 3.5+.
+ ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(4)
- "vld1.16 {q13}, [%4] \n"
+ "ld1 {v30.8h}, [%4] \n"
MEMACCESS(5)
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %3, %3, #12 \n"
    // Shuffle the input data around to align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
+ "uqrshrn v2.8b, v2.8h, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
// combine source lines
- "vadd.u16 q1, q3 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
    // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q13 \n"
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
    // Align for table lookup; vtbl requires registers to
    //  be adjacent.
- "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
MEMACCESS(1)
- "vst1.8 {d3}, [%1]! \n"
+ "st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(dst_width) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v30", "v31", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN38_NEON
-#if 0
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
+ int y_fraction = 256 - source_y_fraction;
asm volatile (
"cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
"cmp %4, #64 \n"
- "beq 75f \n"
+ "b.eq 75f \n"
"cmp %4, #128 \n"
- "beq 50f \n"
+ "b.eq 50f \n"
"cmp %4, #192 \n"
- "beq 25f \n"
+ "b.eq 25f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
// General purpose row blend.
"1: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q0}, [%2]! \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
MEMACCESS(0)
- "vst1.8 {d1[7]}, [%0] \n"
+ "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
- "+r"(source_y_fraction) // %4
+ "+r"(source_y_fraction),// %4
+ "+r"(y_fraction) // %5
:
- : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
);
}
-#endif //0
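The fraction fast paths above replace the general weighted blend with rounding halving adds. A C model of the cases, where r0/r1 are corresponding bytes of the two source rows and f is source_y_fraction:

static unsigned char rhadd8(unsigned char a, unsigned char b) {
  return (unsigned char)((a + b + 1) >> 1);  /* urhadd */
}
/* 50/50 (f == 128): rhadd8(r0, r1)             ~= (r0 + r1) / 2
 * 25/75 (f == 192): rhadd8(rhadd8(r0, r1), r1) ~= (r0 + 3 * r1) / 4
 * general:          (r0 * (256 - f) + r1 * f + 128) >> 8,
 *                   done with umull/umlal and rshrn above. */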
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
- MEMACCESS(0)
- "vld2.32 {q0, q1}, [%0]! \n"
- MEMACCESS(0)
- "vld2.32 {q2, q3}, [%0]! \n"
+ MEMACCESS(0)
+ "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
+ MEMACCESS(0)
+ "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
"subs %2, %2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- MEMACCESS(1)
- "vst1.8 {q3}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
+ MEMACCESS(1)
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ MEMACCESS(1)
+ "st1 {v3.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
:
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
);
}
-#endif //HAS_SCALEARGBROWDOWN2_NEON
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
- ".p2align 2 \n"
"1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r" (src_ptr), // %0
+ "+r" (src_stride), // %1
+ "+r" (dst), // %2
+ "+r" (dst_width) // %3
:
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
);
}
-#endif //HAS_SCALEARGBROWDOWN2_NEON
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4-byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
asm volatile (
- "mov r12, %3, lsl #2 \n"
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.32 {d0[0]}, [%0], r12 \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
MEMACCESS(0)
- "vld1.32 {d0[1]}, [%0], r12 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
MEMACCESS(0)
- "vld1.32 {d1[0]}, [%0], r12 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
MEMACCESS(0)
- "vld1.32 {d1[1]}, [%0], r12 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop.
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
- : "r"(src_stepx) // %3
- : "memory", "cc", "r12", "q0"
+ : "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3
+ : "memory", "cc", "v0"
);
}
-#endif //HAS_SCALEARGBROWDOWNEVEN_NEON
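A simplified C model of the even down-sampler above: point-sample one ARGB pixel (4 bytes) every src_stepx pixels, which is why the asm passes src_stepx * 4 as the post-increment. The uint32 access relies on the 4-byte alignment documented above.

typedef unsigned int uint32;  /* one ARGB pixel */

void ScaleARGBRowDownEven_C_ref(const unsigned char* src_argb, int src_stepx,
                                unsigned char* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}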
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4-byte aligned.
+// TODO: might be worth another optimization pass in the future.
+// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
asm volatile (
- "mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1)
- "vld1.8 {d1}, [%1], r12 \n"
+ "ld1 {v1.8b}, [%1], %4 \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0], r12 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
MEMACCESS(1)
- "vld1.8 {d3}, [%1], r12 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
MEMACCESS(0)
- "vld1.8 {d4}, [%0], r12 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
MEMACCESS(1)
- "vld1.8 {d5}, [%1], r12 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
MEMACCESS(0)
- "vld1.8 {d6}, [%0], r12 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
MEMACCESS(1)
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop.
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
"+r"(dst_width) // %3
- : "r"(src_stepx) // %4
- : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+ : "r"(src_stepx * 4) // %4
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
-#endif // HAS_SCALEARGBROWDOWNEVEN_NEON
-#endif // __aarch64__
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_posix.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_posix.cc
index 352e6678221..bb6e57efe32 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_posix.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_posix.cc
@@ -101,110 +101,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- LABELALIGN
- "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -218,17 +114,12 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1"
);
}
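The hunk above deletes the movdqa (aligned) kernels outright; the body that survives is the former _Unaligned_ version, renamed, since movdqu is safe for any pointer and costs about the same as movdqa on aligned data on current x86 cores. A sketch of the surviving ScaleRowDown2 with SSE2 intrinsics (a model, not the shipped asm; assumes dst_width is a positive multiple of 16 as the jg loop does, with the _Any_ wrapper handling tails):

#include <emmintrin.h>  /* SSE2 */

void ScaleRowDown2_model(const unsigned char* src_ptr,
                         unsigned char* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 16) {
    __m128i a = _mm_loadu_si128((const __m128i*)(src_ptr));       /* movdqu   */
    __m128i b = _mm_loadu_si128((const __m128i*)(src_ptr + 16));
    a = _mm_srli_epi16(a, 8);               /* keep odd bytes      (psrlw)   */
    b = _mm_srli_epi16(b, 8);
    _mm_storeu_si128((__m128i*)(dst_ptr),
                     _mm_packus_epi16(a, b));                     /* packuswb */
    src_ptr += 32;
    dst_ptr += 16;
  }
}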
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -236,7 +127,7 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
@@ -254,17 +145,12 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -274,7 +160,6 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
- BUNDLEALIGN
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
@@ -296,13 +181,8 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -315,8 +195,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
@@ -330,11 +210,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
@@ -348,18 +224,16 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3
- MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4
- MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5
+ MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
+ MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4
+ MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm4,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
@@ -388,13 +262,8 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"+r"(dst_width), // %2
"+r"(stridex3) // %3
: "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
);
}
@@ -412,8 +281,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"palignr $0x8,%%xmm0,%%xmm1 \n"
@@ -429,11 +298,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -461,8 +326,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
"pshufb %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm5,%%xmm6 \n"
@@ -479,9 +344,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
"psrlw $0x2,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm7,%%xmm6 \n"
"pshufb %%xmm4,%%xmm6 \n"
@@ -498,13 +362,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -533,8 +392,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
"pavgb %%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
"pshufb %%xmm2,%%xmm6 \n"
@@ -553,8 +412,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
"psrlw $0x2,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
- MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
@@ -572,13 +431,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -590,8 +444,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm4,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
@@ -607,10 +461,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
"+r"(dst_width) // %2
: "m"(kShuf38a), // %3
"m"(kShuf38b) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
);
}
@@ -631,9 +482,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pshufb %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,%%xmm6 \n"
@@ -643,23 +495,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm0,%%xmm1 \n"
"pmulhuw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
- "sub $0x6,%2 \n"
"movd %%xmm1," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm1 \n"
"movd %%xmm1," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n"
+ "sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@@ -679,8 +526,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
"movhlps %%xmm0,%%xmm1 \n"
"movhlps %%xmm6,%%xmm7 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
@@ -689,7 +536,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
"punpcklbw %%xmm5,%%xmm7 \n"
"paddusw %%xmm6,%%xmm0 \n"
"paddusw %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6
+ MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
"lea " MEMLEA(0x10,0) ",%0 \n"
"movhlps %%xmm6,%%xmm7 \n"
"punpcklbw %%xmm5,%%xmm6 \n"
@@ -711,23 +558,18 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm7,%%xmm6 \n"
"pmulhuw %%xmm4,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
- "sub $0x6,%2 \n"
"movd %%xmm6," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm6 \n"
"movd %%xmm6," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n"
+ "sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -741,7 +583,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"mov %0,%3 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -753,7 +595,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"2: \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
"add %6,%0 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
@@ -765,8 +607,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"3: \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x10,3) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x10,%4 \n"
@@ -778,10 +620,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"+r"(src_width), // %4
"+rm"(src_height) // %5
: "rm"((intptr_t)(src_stride)) // %6
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
@@ -813,7 +652,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
"movd %k2,%%xmm0 \n"
"psrlw $0x9,%%xmm1 \n"
- BUNDLEALIGN
MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
"movd %k2,%%xmm4 \n"
"pshufb %%xmm5,%%xmm1 \n"
@@ -853,13 +691,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"+rm"(dst_width) // %5
: "rm"(x), // %6
"rm"(dx) // %7
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@@ -870,25 +703,21 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
- "sub $0x20,%2 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1"
);
}
@@ -898,22 +727,18 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"shufps $0xdd,%%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1"
);
}
@@ -923,25 +748,21 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1"
);
}
@@ -951,11 +772,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
@@ -963,21 +783,16 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
);
}
@@ -996,29 +811,22 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
"movd " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
"punpckldq %%xmm1,%%xmm0 \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
"lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
"punpckldq %%xmm3,%%xmm2 \n"
"punpcklqdq %%xmm2,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
"+r"(dst_width), // %3
"+r"(src_stepx_x12) // %4
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
);
}
@@ -1040,11 +848,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"movq " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
- BUNDLEALIGN
MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
"lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
"movq " MEMACCESS(5) ",%%xmm2 \n"
- BUNDLEALIGN
MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
@@ -1055,9 +861,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
@@ -1065,14 +871,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"+rm"(dst_width), // %3
"+r"(src_stepx_x12), // %4
"+r"(row1) // %5
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
);
}
@@ -1111,15 +911,14 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
"pextrw $0x3,%%xmm2,%k1 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"punpcklqdq %%xmm1,%%xmm0 \n"
- "sub $0x4,%4 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
"jge 40b \n"
"49: \n"
"test $0x2,%4 \n"
"je 29f \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
"pextrw $0x5,%%xmm2,%k0 \n"
@@ -1139,13 +938,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
"+r"(dst_width) // %4
: "rm"(x), // %5
"rm"(dx) // %6
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
@@ -1156,28 +950,22 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpckldq %%xmm0,%%xmm0 \n"
"punpckhdq %%xmm1,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
);
}
@@ -1225,7 +1013,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
"paddd %%xmm3,%%xmm2 \n"
MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
"psrlw $0x9,%%xmm1 \n"
- BUNDLEALIGN
MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
"pshufb %%xmm5,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
@@ -1245,7 +1032,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
"add $0x1,%2 \n"
"jl 99f \n"
"psrlw $0x9,%%xmm2 \n"
- BUNDLEALIGN
MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
"pshufb %%xmm5,%%xmm2 \n"
"pshufb %%xmm4,%%xmm0 \n"
@@ -1264,13 +1050,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
"+r"(x1) // %4
: "rm"(x), // %5
"rm"(dx) // %6
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
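
The pattern running through the scale_gcc.cc hunks above: every aligned movdqa load and store becomes an unaligned movdqu, so the row functions no longer require 16-byte-aligned pointers or strides; the BUNDLEALIGN markers and the open-coded __native_client__ r14 clobbers collapse into the NACL_R14 macro; and the #if defined(__SSE2__) guards around the xmm clobber lists are dropped in favor of always naming the registers. A minimal SSE2 intrinsics sketch, not from the patch, of the aligned/unaligned distinction being removed:

    #include <emmintrin.h>
    #include <stdint.h>

    /* movdqa (_mm_load_si128) faults unless p is 16-byte aligned;
     * movdqu (_mm_loadu_si128) accepts any address, which is what the
     * rewritten row functions rely on. */
    static __m128i load_16_bytes(const uint8_t *p) {
      if (((uintptr_t)p & 15) == 0)
        return _mm_load_si128((const __m128i *)p);   /* movdqa */
      return _mm_loadu_si128((const __m128i *)p);    /* movdqu */
    }

On SSE2-era hardware the aligned form was enough faster that libyuv kept separate _Unaligned variants; on current cores movdqu on aligned data costs essentially the same, so a single unaligned path can serve both, and the scale_win.cc diff below deletes the _Unaligned twins outright.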
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc
index 840b9738da5..e0209cdec8c 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc
@@ -103,118 +103,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // isolate odd pixels.
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -222,9 +110,9 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg wloop
ret
@@ -234,9 +122,8 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
@@ -245,7 +132,6 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
- align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -261,9 +147,9 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
pavgw xmm1, xmm3
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg wloop
ret
@@ -273,9 +159,8 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
@@ -285,7 +170,6 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
- align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -305,9 +189,9 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
pavgw xmm1, xmm3
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg wloop
pop esi
@@ -329,19 +213,18 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrld xmm5, 24
pslld xmm5, 16
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5
pand xmm1, xmm5
packuswb xmm0, xmm1
psrlw xmm0, 8
packuswb xmm0, xmm0
- sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
+ sub ecx, 8
jg wloop
ret
@@ -364,18 +247,17 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
- movdqa xmm2, [eax + esi * 2]
- movdqa xmm3, [eax + esi * 2 + 16]
- movdqa xmm4, [eax + edi]
- movdqa xmm5, [eax + edi + 16]
+ movdqu xmm2, [eax + esi * 2]
+ movdqu xmm3, [eax + esi * 2 + 16]
+ movdqu xmm4, [eax + edi]
+ movdqu xmm5, [eax + edi + 16]
lea eax, [eax + 32]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
@@ -398,9 +280,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm0, xmm2
packuswb xmm0, xmm0
- sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
+ sub ecx, 8
jg wloop
pop edi
@@ -427,10 +309,9 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf1
movdqa xmm5, kShuf2
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm1
palignr xmm1, xmm0, 8
@@ -481,10 +362,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
- align 4
wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
pavgb xmm0, xmm1
pshufb xmm0, xmm2
pmaddubsw xmm0, xmm5
@@ -501,8 +381,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm1
pshufb xmm0, xmm4
@@ -511,9 +391,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- sub ecx, 24
movq qword ptr [edx + 16], xmm0
lea edx, [edx + 24]
+ sub ecx, 24
jg wloop
pop esi
@@ -540,10 +420,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
- align 4
wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm2
@@ -562,8 +441,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
@@ -573,9 +452,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- sub ecx, 24
movq qword ptr [edx + 16], xmm0
lea edx, [edx+24]
+ sub ecx, 24
jg wloop
pop esi
@@ -597,20 +476,19 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf38a
movdqa xmm5, kShuf38b
- align 4
xloop:
- movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
- movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
lea eax, [eax + 32]
pshufb xmm0, xmm4
pshufb xmm1, xmm5
paddusb xmm0, xmm1
- sub ecx, 12
movq qword ptr [edx], xmm0 // write 12 pixels
movhlps xmm1, xmm0
movd [edx + 8], xmm1
lea edx, [edx + 12]
+ sub ecx, 12
jg xloop
ret
@@ -633,10 +511,9 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kScaleAc33
pxor xmm5, xmm5
- align 4
xloop:
- movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
- movdqa xmm6, [eax + esi]
+ movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqu xmm6, [eax + esi]
movhlps xmm1, xmm0
movhlps xmm7, xmm6
punpcklbw xmm0, xmm5
@@ -645,7 +522,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
punpcklbw xmm7, xmm5
paddusw xmm0, xmm6
paddusw xmm1, xmm7
- movdqa xmm6, [eax + esi * 2]
+ movdqu xmm6, [eax + esi * 2]
lea eax, [eax + 16]
movhlps xmm7, xmm6
punpcklbw xmm6, xmm5
@@ -671,11 +548,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
packuswb xmm6, xmm6
- sub ecx, 6
movd [edx], xmm6 // write 6 pixels
psrlq xmm6, 16
movd [edx + 2], xmm6
lea edx, [edx + 6]
+ sub ecx, 6
jg xloop
pop esi
@@ -699,11 +576,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kShufAb2
movdqa xmm5, kScaleAb2
- align 4
xloop:
- movdqa xmm0, [eax] // average 2 rows into xmm0
- pavgb xmm0, [eax + esi]
+ movdqu xmm0, [eax] // average 2 rows into xmm0
+ movdqu xmm1, [eax + esi]
lea eax, [eax + 16]
+ pavgb xmm0, xmm1
movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
pshufb xmm1, xmm2
@@ -716,11 +593,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
packuswb xmm1, xmm1
- sub ecx, 6
movd [edx], xmm1 // write 6 pixels
psrlq xmm1, 16
movd [edx + 2], xmm1
lea edx, [edx + 6]
+ sub ecx, 6
jg xloop
pop esi
@@ -747,10 +624,9 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pxor xmm4, xmm4
dec ebx
- align 4
xloop:
// first row
- movdqa xmm0, [esi]
+ movdqu xmm0, [esi]
lea eax, [esi + edx]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm4
@@ -761,9 +637,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
je ydone
// sum remaining rows
- align 4
yloop:
- movdqa xmm2, [eax] // read 16 pixels
+ movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
movdqa xmm3, xmm2
punpcklbw xmm2, xmm4
@@ -773,10 +648,9 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
sub ebp, 1
jg yloop
- align 4
ydone:
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm1
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
lea edi, [edi + 32]
sub ecx, 16
@@ -828,7 +702,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop.
- align 4
xloop2:
movdqa xmm1, xmm2 // x0, x1 fractions.
paddd xmm2, xmm3 // x += dx
@@ -851,7 +724,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 2 // 2 pixels
jge xloop2
- align 4
xloop29:
add ecx, 2 - 1
@@ -869,7 +741,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd ebx, xmm0
mov [edi], bl
- align 4
xloop99:
pop edi
@@ -889,17 +760,16 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
mov eax, [esp + 8] // src_ptr
mov ecx, [esp + 12] // dst_width
- align 4
wloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0
punpckhbw xmm1, xmm1
- sub ecx, 32
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
+ sub ecx, 32
jg wloop
ret
@@ -918,15 +788,14 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
shufps xmm0, xmm1, 0xdd
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg wloop
ret
@@ -945,18 +814,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg wloop
ret
@@ -976,12 +844,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // dst_width
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
@@ -989,9 +856,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg wloop
pop esi
@@ -1016,7 +883,6 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
- align 4
wloop:
movd xmm0, [eax]
movd xmm1, [eax + ebx]
@@ -1026,9 +892,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea eax, [eax + ebx * 4]
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg wloop
pop edi
@@ -1057,7 +923,6 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
- align 4
wloop:
movq xmm0, qword ptr [eax] // row0 4 pairs
movhps xmm0, qword ptr [eax + ebx]
@@ -1075,9 +940,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg wloop
pop edi
@@ -1118,7 +983,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
jl xloop49
// 4 Pixel loop.
- align 4
xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
@@ -1133,12 +997,11 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
punpckldq xmm1, xmm4 // x2 x3
punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
- sub ecx, 4 // 4 pixels
movdqu [edi], xmm0
lea edi, [edi + 16]
+ sub ecx, 4 // 4 pixels
jge xloop4
- align 4
xloop49:
test ecx, 2
je xloop29
@@ -1159,7 +1022,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
// 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0
- align 4
xloop99:
pop esi
@@ -1209,7 +1071,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop.
- align 4
xloop2:
movdqa xmm1, xmm2 // x0, x1 fractions.
paddd xmm2, xmm3 // x += dx
@@ -1229,7 +1090,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
sub ecx, 2 // 2 pixels
jge xloop2
- align 4
xloop29:
add ecx, 2 - 1
@@ -1246,7 +1106,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
movd [edi], xmm0
- align 4
xloop99:
pop edi
@@ -1265,17 +1124,16 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
mov eax, [esp + 8] // src_argb
mov ecx, [esp + 12] // dst_width
- align 4
wloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpckldq xmm0, xmm0
punpckhdq xmm1, xmm1
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
+ sub ecx, 8
jg wloop
ret
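
Two more edits recur through the scale_win.cc hunks: the align 4 directives ahead of the loop labels are deleted, and each sub ecx, N moves from before the store to immediately before jg. The sub is what sets the flags jg tests, so pairing them keeps the flag producer and consumer adjacent (and allows sub/jcc macro-fusion on modern Intel cores). A scalar C sketch of the resulting loop shape, using ScaleRowDown2's keep-the-odd-pixels behavior; the helper name is hypothetical and dst_width is assumed a multiple of 16:

    #include <stdint.h>

    static void scale_row_down2(const uint8_t *src, uint8_t *dst,
                                int dst_width) {
      do {
        int i;
        for (i = 0; i < 16; ++i)
          dst[i] = src[2 * i + 1];  /* psrlw 8 + packuswb keep odd bytes */
        src += 32;
        dst += 16;
        dst_width -= 16;            /* sub ecx, 16 sets the flags...     */
      } while (dst_width > 0);      /* ...that jg wloop consumes         */
    }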
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/video_common.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/video_common.cc
index efbedf46e2b..379a0669ae6 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/video_common.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/video_common.cc
@@ -33,7 +33,7 @@ static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
{FOURCC_DMB1, FOURCC_MJPG},
- {FOURCC_BA81, FOURCC_BGGR},
+ {FOURCC_BA81, FOURCC_BGGR}, // deprecated.
{FOURCC_RGB3, FOURCC_RAW },
{FOURCC_BGR3, FOURCC_24BG},
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/README.libvpx b/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/README.libvpx
index 02cd9ab4edc..343bcf9624b 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/README.libvpx
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/README.libvpx
@@ -9,3 +9,4 @@ defines that help automatically allow assembly to work cross-platform.
Local Modifications:
Some modifications to allow PIC to work with x86inc.
+Conditionally define program_name to allow overriding.
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/x86inc.asm b/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/x86inc.asm
index 99453a99854..bc8116995dd 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/x86inc.asm
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/x86inc.asm
@@ -36,7 +36,9 @@
%include "vpx_config.asm"
+%ifndef program_name
%define program_name vp9
+%endif
%define UNIX64 0
@@ -78,6 +80,9 @@
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,macho64
SECTION .text align=%1
+ %elifidn __OUTPUT_FORMAT__,macho32
+ SECTION .text align=%1
+ fakegot:
%elifidn __OUTPUT_FORMAT__,macho
SECTION .text align=%1
fakegot:
@@ -617,9 +622,17 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%elifidn __OUTPUT_FORMAT__,elf64
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,macho32
- global %1:private_extern
+ %ifdef __NASM_VER__
+ global %1
+ %else
+ global %1:private_extern
+ %endif
%elifidn __OUTPUT_FORMAT__,macho64
- global %1:private_extern
+ %ifdef __NASM_VER__
+ global %1
+ %else
+ global %1:private_extern
+ %endif
%else
global %1
%endif
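
The x86inc.asm changes are build plumbing: wrapping %define program_name vp9 in %ifndef lets an embedder pre-define the symbol prefix (for example, %define program_name vpx before the include, as the README note above advertises) instead of hard-coding vp9; the new macho32 branch gives 32-bit Mach-O the same .text-with-fakegot RODATA placement the plain macho target already had; and the __NASM_VER__ checks fall back to a bare global because nasm, unlike yasm, does not accept the global sym:private_extern form.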
diff --git a/chromium/third_party/libvpx/source/libvpx/usage.dox b/chromium/third_party/libvpx/source/libvpx/usage.dox
index 237b8dc42bf..88235202d17 100644
--- a/chromium/third_party/libvpx/source/libvpx/usage.dox
+++ b/chromium/third_party/libvpx/source/libvpx/usage.dox
@@ -12,13 +12,13 @@
- \ref usage_init
- \ref usage_errors
- Fore more information on decoder and encoder specific usage, see the
+ For more information on decoder and encoder specific usage, see the
following pages:
\if decoder
- - \subpage usage_decode
+ \li \subpage usage_decode
\endif
- \if decoder
- - \subpage usage_encode
+ \if encoder
+ \li \subpage usage_encode
\endif
\section usage_types Important Data Types
@@ -80,10 +80,13 @@
The available initialization methods are:
- \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif
- \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif
- \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif
-
+ \if encoder
+ \li #vpx_codec_enc_init (calls vpx_codec_enc_init_ver())
+ \li #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver())
+ \endif
+ \if decoder
+ \li #vpx_codec_dec_init (calls vpx_codec_dec_init_ver())
+ \endif
\section usage_errors Error Handling
diff --git a/chromium/third_party/libvpx/source/libvpx/usage_cx.dox b/chromium/third_party/libvpx/source/libvpx/usage_cx.dox
index 62f3e450b0e..92b0d34ef4d 100644
--- a/chromium/third_party/libvpx/source/libvpx/usage_cx.dox
+++ b/chromium/third_party/libvpx/source/libvpx/usage_cx.dox
@@ -1,4 +1,4 @@
-/*! \page usage_encode Encode
+/*! \page usage_encode Encoding
The vpx_codec_encode() function is at the core of the encode loop. It
processes raw images passed by the application, producing packets of
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c
index 54afc13355a..b9d875a2ff7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c
@@ -103,9 +103,9 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
goto allocation_fail;
oci->post_proc_buffer_int_used = 0;
- vpx_memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
- vpx_memset(oci->post_proc_buffer.buffer_alloc, 128,
- oci->post_proc_buffer.frame_size);
+ memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+ memset(oci->post_proc_buffer.buffer_alloc, 128,
+ oci->post_proc_buffer.frame_size);
/* Allocate buffer to store post-processing filter coefficients.
*
@@ -176,7 +176,7 @@ void vp8_create_common(VP8_COMMON *oci)
oci->clamp_type = RECON_CLAMP_REQUIRED;
/* Initialize reference frame sign bias structure to defaults */
- vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+ memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
/* Default disable buffer to buffer copying */
oci->copy_buffer_to_gf = 0;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
index 2510ad83835..db48ded5827 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
@@ -165,7 +165,7 @@ vp8_dequant_idct_loop2_v6
str r1, [r2], r12 ; store output to dst
bne vp8_dequant_idct_loop2_v6
-; vpx_memset
+; memset
sub r0, r0, #32
add sp, sp, #4
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c
index 7fe39674eb6..d6a6781d862 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c
@@ -99,7 +99,7 @@ void vp8_sixtap_predict4x4_armv6
{
const short *HFilter;
const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED(4, short, FData[12*4]); /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
@@ -147,7 +147,7 @@ void vp8_sixtap_predict8x8_armv6
{
const short *HFilter;
const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED(4, short, FData[16*8]); /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
@@ -189,7 +189,7 @@ void vp8_sixtap_predict16x16_armv6
{
const short *HFilter;
const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED(4, short, FData[24*16]); /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c
deleted file mode 100644
index 6595ac0519b..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-unsigned int vp8_sad8x8_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 7; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
-
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad8x16_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 15; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
-
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad4x4_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x2_t d1;
- uint64x1_t d3;
- int i;
-
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 3; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
-
- d1 = vpaddl_u16(vget_low_u16(q12));
- d3 = vpaddl_u32(d1);
-
- return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
-}
-
-unsigned int vp8_sad16x16_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x16_t q0, q4;
- uint16x8_t q12, q13;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
- for (i = 0; i < 15; i++) {
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
- }
-
- q12 = vaddq_u16(q12, q13);
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad16x8_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x16_t q0, q4;
- uint16x8_t q12, q13;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
- for (i = 0; i < 7; i++) {
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
- }
-
- q12 = vaddq_u16(q12, q13);
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
-}
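
All five deleted vp8_sad*_neon kernels follow one template: widen-and-accumulate absolute differences row by row (vabdl_u8, then vabal_u8 in the loop), then pairwise-add the lanes down to a single 32-bit total. The patch retires these VP8-local copies; the mfqe.c hunk further below moves the remaining callers onto the shared vpx_sad* kernels. A scalar reference, not from the tree, for what all of them compute:

    #include <stdlib.h>

    /* Sum of absolute differences over a width x height block. */
    static unsigned int sad_block(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  int width, int height) {
      unsigned int total = 0;
      int r, c;
      for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c)
          total += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      return total;
    }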
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
index 8308d555b37..974d3b6532b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
@@ -32,7 +32,7 @@ unsigned int vp8_sub_pixel_variance16x16_neon_func(
int dst_pixels_per_line,
unsigned int *sse) {
int i;
- DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
+ DECLARE_ALIGNED(16, unsigned char, tmp[528]);
unsigned char *tmpp;
unsigned char *tmpp2;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
@@ -911,12 +911,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_neon(
return vget_lane_u32(d0u32, 0);
}
-enum { kWidth8 = 8 };
-enum { kHeight8 = 8 };
-enum { kHeight8PlusOne = 9 };
-enum { kPixelStepOne = 1 };
-enum { kAlign16 = 16 };
-
#define FILTER_BITS 7
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
@@ -968,8 +962,8 @@ static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
- return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+ return *sse - (((int64_t)sum * sum) / (8 * 8));
}
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@@ -1003,22 +997,21 @@ unsigned int vp8_sub_pixel_variance8x8_neon(
const unsigned char *dst,
int dst_stride,
unsigned int *sse) {
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
+ DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
if (xoffset == 0) {
- var_filter_block2d_bil_w8(src, temp2, src_stride, kWidth8, kHeight8,
- kWidth8, bilinear_taps_coeff[yoffset]);
+ var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8,
+ 8, bilinear_taps_coeff[yoffset]);
} else if (yoffset == 0) {
- var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne,
- kHeight8PlusOne, kWidth8,
+ var_filter_block2d_bil_w8(src, temp2, src_stride, 1,
+ 9, 8,
bilinear_taps_coeff[xoffset]);
} else {
- var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
- kHeight8PlusOne, kWidth8,
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
+ 9, 8,
bilinear_taps_coeff[xoffset]);
- var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
- kWidth8, bilinear_taps_coeff[yoffset]);
+ var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
+ 8, bilinear_taps_coeff[yoffset]);
}
- return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
+ return variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
-
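
The vp8_subpixelvariance_neon.c hunks swap DECLARE_ALIGNED_ARRAY for DECLARE_ALIGNED and inline the kWidth8/kHeight8-style named constants as plain 8, 9, 1 and 16; the arithmetic itself is untouched. variance8x8_neon still applies the standard identity, sketched here, not from the tree:

    #include <stdint.h>

    /* variance = SSE - sum^2 / N over the N = w*h pixels of a block;
     * the int64_t cast keeps sum*sum from overflowing. */
    static unsigned int variance_of_block(unsigned int sse, int sum,
                                          int w, int h) {
      return sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
    }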
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h
index ea1a6a4adfd..192108a06db 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h
@@ -187,8 +187,12 @@ typedef struct
{
FRAME_TYPE frame_type;
int is_frame_dropped;
+ // The frame rate for the lowest resolution.
+ double low_res_framerate;
/* The frame number of each reference frames */
unsigned int low_res_ref_frames[MAX_REF_FRAMES];
+ // The video frame counter value for the key frame, for lowest resolution.
+ unsigned int key_frame_counter_value;
LOWER_RES_MB_INFO *mb_info;
} LOWER_RES_FRAME_INFO;
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h
index 17262d6983c..ba3d9f54d1e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h
@@ -29,19 +29,19 @@ extern "C" {
#define vp8_copy( Dest, Src) { \
assert( sizeof( Dest) == sizeof( Src)); \
- vpx_memcpy( Dest, Src, sizeof( Src)); \
+ memcpy( Dest, Src, sizeof( Src)); \
}
/* Use this for variably-sized arrays. */
#define vp8_copy_array( Dest, Src, N) { \
assert( sizeof( *Dest) == sizeof( *Src)); \
- vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+ memcpy( Dest, Src, N * sizeof( *Src)); \
}
-#define vp8_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest));
+#define vp8_zero( Dest) memset( &Dest, 0, sizeof( Dest));
-#define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
+#define vp8_zero_array( Dest, N) memset( Dest, 0, N * sizeof( *Dest));
#ifdef __cplusplus
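
The vpx_memcpy/vpx_memset wrappers removed throughout these vp8/common hunks were direct aliases for the libc routines, so vp8_copy and friends keep their behavior, including the whole-object sizeof assert. A small stand-in with a hypothetical name, showing what that assert buys:

    #include <assert.h>
    #include <string.h>

    #define copy_whole(Dest, Src) do { \
        assert(sizeof(Dest) == sizeof(Src)); /* whole-array sizes match */ \
        memcpy(Dest, Src, sizeof(Src)); \
      } while (0)

    static void demo(void) {
      int prev[16] = {0}, curr[16];
      copy_whole(curr, prev);  /* 64 bytes; passing a pointer would assert */
    }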
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c
new file mode 100644
index 00000000000..fd96c863491
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+
+#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+/* Copy 2 macroblocks to a buffer */
+void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
+ unsigned char *dst_ptr, int dst_stride,
+ int height)
+{
+ int r;
+
+ for (r = 0; r < height; r++)
+ {
+ memcpy(dst_ptr, src_ptr, 32);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/debugmodes.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/debugmodes.c
index 46064e61d53..159fddc6a76 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/debugmodes.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/debugmodes.c
@@ -81,7 +81,6 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
fprintf(mvs, "\n");
/* print out the block modes */
- mb_index = 0;
fprintf(mvs, "Mbs for Frame %d\n", frame);
{
int b_row;
@@ -129,7 +128,6 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
/* print out the block modes */
- mb_index = 0;
fprintf(mvs, "MVs for Frame %d\n", frame);
{
int b_row;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/dequantize.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/dequantize.c
index 6e2f69a773e..f8b04fa4ee5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/dequantize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/dequantize.c
@@ -38,6 +38,6 @@ void vp8_dequant_idct_add_c(short *input, short *dq,
vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
- vpx_memset(input, 0, 32);
+ memset(input, 0, 32);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.c
index 8c046a4f57c..c00e565f063 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.c
@@ -183,7 +183,6 @@ const vp8_extra_bit_struct vp8_extra_bits[12] =
void vp8_default_coef_probs(VP8_COMMON *pc)
{
- vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
- sizeof(default_coef_probs));
+ memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs));
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.c
index 091e4c732b0..8981a8d3c2a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.c
@@ -159,13 +159,13 @@ const vp8_tree_index vp8_small_mvtree [14] =
void vp8_init_mbmode_probs(VP8_COMMON *x)
{
- vpx_memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
- vpx_memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
- vpx_memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
+ memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
+ memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
+ memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
}
void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
{
- vpx_memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+ memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.c
index c9bdd21897d..2d938ad7825 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.c
@@ -40,9 +40,9 @@ static void copy_and_extend_plane
for (i = 0; i < h; i++)
{
- vpx_memset(dest_ptr1, src_ptr1[0], el);
- vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
- vpx_memset(dest_ptr2, src_ptr2[0], er);
+ memset(dest_ptr1, src_ptr1[0], el);
+ memcpy(dest_ptr1 + el, src_ptr1, w);
+ memset(dest_ptr2, src_ptr2[0], er);
src_ptr1 += sp;
src_ptr2 += sp;
dest_ptr1 += dp;
@@ -60,13 +60,13 @@ static void copy_and_extend_plane
for (i = 0; i < et; i++)
{
- vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+ memcpy(dest_ptr1, src_ptr1, linesize);
dest_ptr1 += dp;
}
for (i = 0; i < eb; i++)
{
- vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+ memcpy(dest_ptr2, src_ptr2, linesize);
dest_ptr2 += dp;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/idct_blk.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/idct_blk.c
index 65d5002c8bc..8aa7d9bf0ff 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/idct_blk.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/idct_blk.c
@@ -33,7 +33,7 @@ void vp8_dequant_idct_add_y_block_c
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
@@ -59,7 +59,7 @@ void vp8_dequant_idct_add_uv_block_c
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
@@ -78,7 +78,7 @@ void vp8_dequant_idct_add_uv_block_c
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c
index 7a07e76fc41..8b55dff92bf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c
@@ -82,11 +82,10 @@ void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
if (block_inside_limit < 1)
block_inside_limit = 1;
- vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
- vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
- SIMD_WIDTH);
- vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
- SIMD_WIDTH);
+ memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+ memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH);
+ memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
}
}
@@ -105,7 +104,7 @@ void vp8_loop_filter_init(VP8_COMMON *cm)
/* init hev threshold const vectors */
for(i = 0; i < 4 ; i++)
{
- vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}
}
@@ -151,7 +150,7 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm,
/* we could get rid of this if we assume that deltas are set to
* zero when not in use; encoder always uses deltas
*/
- vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+ memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
continue;
}
@@ -261,6 +260,7 @@ void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
int mb_col;
int filter_level;
loop_filter_info_n *lfi_n = &cm->lf_info;
+ (void)post_uvstride;
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c
index 069332660e3..d12dea19364 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c
@@ -17,10 +17,11 @@
* higher quality.
*/
-#include "postproc.h"
-#include "variance.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/common/postproc.h"
+#include "vp8/common/variance.h"
#include "vpx_mem/vpx_mem.h"
-#include "vp8_rtcd.h"
#include "vpx_scale/yv12config.h"
#include <limits.h>
@@ -153,16 +154,16 @@ static void multiframe_quality_enhance_block
actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
#ifdef USE_SSD
- sad = (vp8_variance16x16(y, y_stride, yd, yd_stride, &sse));
+ vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 128)>>8;
- usad = (vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse));
+ vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 32)>>6;
- vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse));
+ vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 32)>>6;
#else
- sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
- usad = (vp8_sad8x8(u, uv_stride, ud, uvd_stride, UINT_MAX) + 32) >> 6;
- vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, UINT_MAX)+ 32) >> 6;
+ sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+ usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6;
+ vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride)+ 32) >> 6;
#endif
}
else /* if (blksize == 8) */
@@ -170,16 +171,16 @@ static void multiframe_quality_enhance_block
actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
#ifdef USE_SSD
- sad = (vp8_variance8x8(y, y_stride, yd, yd_stride, &sse));
+ vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 32)>>6;
- usad = (vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse));
+ vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 8)>>4;
- vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse));
+ vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 8)>>4;
#else
- sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6;
- usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, UINT_MAX) + 8) >> 4;
- vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, UINT_MAX) + 8) >> 4;
+ sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
+ usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4;
+ vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4;
#endif
}
@@ -231,9 +232,9 @@ static void multiframe_quality_enhance_block
{
vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
- vpx_memcpy(udp, up, uvblksize);
+ memcpy(udp, up, uvblksize);
for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
- vpx_memcpy(vdp, vp, uvblksize);
+ memcpy(vdp, vp, uvblksize);
}
}
}
@@ -341,8 +342,8 @@ void vp8_multiframe_quality_enhance
for (k = 0; k < 4; ++k, up += show->uv_stride, udp += dest->uv_stride,
vp += show->uv_stride, vdp += dest->uv_stride)
{
- vpx_memcpy(udp, up, 4);
- vpx_memcpy(vdp, vp, 4);
+ memcpy(udp, up, 4);
+ memcpy(vdp, vp, 4);
}
}
}
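Two related API shifts appear in the mfqe.c hunks above. In the USE_SSD branches the variance helpers' return value is now discarded and only the sse output parameter is used. In the SAD branches the old vp8_sad* helpers, whose final max_sad argument was an early-termination bound (passed as UINT_MAX, i.e. never terminate early), give way to vpx_dsp's vpx_sad*, which take no bound. The (x + 128) >> 8 and (x + 32) >> 6 steps turn a block SAD into a rounded per-pixel average (a 16x16 block has 256 pixels, an 8x8 block 64). A scalar sketch of the quantity being computed, assuming the conventional SAD definition:

    static unsigned int block_sad(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  int width, int height) {
        unsigned int sad = 0;
        int r, c;
        for (r = 0; r < height; ++r) {
            for (c = 0; c < width; ++c)
                sad += src[c] > ref[c] ? src[c] - ref[c] : ref[c] - src[c];
            src += src_stride;
            ref += ref_stride;
        }
        return sad;  /* (block_sad(..., 16, 16) + 128) >> 8 is the mean */
    }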
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
index 619ee808d8d..fc3bb8ad9db 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -26,7 +26,7 @@ void vp8_dequant_idct_add_dspr2(short *input, short *dq,
vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);
- vpx_memset(input, 0, 32);
+ memset(input, 0, 32);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
index d48c4fe5e3d..f39b675cd50 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
@@ -122,6 +122,7 @@ extern "C"
int Sharpness;
int cpu_used;
unsigned int rc_max_intra_bitrate_pct;
+ unsigned int screen_content_mode;
/* mode ->
*(0)=Realtime/Live Encoding. This mode is optimized for realtim
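screen_content_mode is a new field in the VP8 encoder configuration above. In this snapshot the corresponding application-facing knob appears to be the VP8E_SET_SCREEN_CONTENT_MODE codec control in vpx/vp8cx.h; the value semantics assumed below (0 = off, nonzero = on) are an inference, not confirmed by this diff. A hedged usage sketch:

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* assumed mapping onto VP8_CONFIG::screen_content_mode */
    static void enable_screen_content(vpx_codec_ctx_t *encoder) {
        vpx_codec_control(encoder, VP8E_SET_SCREEN_CONTENT_MODE, 1);
    }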
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c
index 277f37194a9..266431a3240 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c
@@ -355,8 +355,8 @@ void vp8_deblock(VP8_COMMON *cm,
else
mb_ppl = (unsigned char)ppl;
- vpx_memset(ylptr, mb_ppl, 16);
- vpx_memset(uvlptr, mb_ppl, 8);
+ memset(ylptr, mb_ppl, 16);
+ memset(uvlptr, mb_ppl, 8);
ylptr += 16;
uvlptr += 8;
@@ -403,7 +403,7 @@ void vp8_de_noise(VP8_COMMON *cm,
(void) low_var_thresh;
(void) flag;
- vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
+ memset(limits, (unsigned char)ppl, 16 * mb_cols);
    /* TODO: The original code doesn't filter the 2 outer rows and columns. */
for (mbr = 0; mbr < mb_rows; mbr++)
@@ -518,6 +518,7 @@ void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
unsigned int Width, unsigned int Height, int Pitch)
{
unsigned int i, j;
+ (void)bothclamp;
for (i = 0; i < Height; i++)
{
@@ -762,7 +763,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
        /* ensure that postproc is set to all 0's so that post proc
* doesn't pull random data in from edge
*/
- vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
+ memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/copy_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/copy_altivec.asm
deleted file mode 100644
index a4ce9158342..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/copy_altivec.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;# but the output will be. So two reads and a perm
-;# for the input, but only one store for the output.
-copy_mem16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xe000
- mtspr 256, r12 ;# set VRSAVE
-
- li r10, 16
- mtctr r10
-
-cp_16x16_loop:
-    lvsl v0, 0, r3 ;# permute value for alignment
-
- lvx v1, 0, r3
- lvx v2, r10, r3
-
- vperm v1, v1, v2, v0
-
- stvx v1, 0, r5
-
- add r3, r3, r4 ;# increment source pointer
- add r5, r5, r6 ;# increment destination pointer
-
- bdnz cp_16x16_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_altivec.asm
deleted file mode 100644
index 4da2e94f959..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_altivec.asm
+++ /dev/null
@@ -1,1013 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl sixtap_predict_ppc
- .globl sixtap_predict8x4_ppc
- .globl sixtap_predict8x8_ppc
- .globl sixtap_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_hfilter V0, V1
- load_c \V0, HFilter, r5, r9, r10
-
- addi r5, r5, 16
- lvx \V1, r5, r10
-.endm
-
-;# Vertical filtering
-.macro Vprolog
- load_c v0, VFilter, r6, r3, r10
-
- vspltish v5, 8
- vspltish v6, 3
- vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v1, v0, 1
- vspltb v2, v0, 2
- vspltb v3, v0, 3
- vspltb v4, v0, 4
- vspltb v5, v0, 5
- vspltb v0, v0, 0
-.endm
-
-.macro vpre_load
- Vprolog
- li r10, 16
- lvx v10, 0, r9 ;# v10..v14 = first 5 rows
- lvx v11, r10, r9
- addi r9, r9, 32
- lvx v12, 0, r9
- lvx v13, r10, r9
- addi r9, r9, 32
- lvx v14, 0, r9
-.endm
-
-.macro Msum Re, Ro, V, T, TMP
- ;# (Re,Ro) += (V*T)
- vmuleub \TMP, \V, \T ;# trashes v8
- vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary
- vmuloub \TMP, \V, \T
- vadduhm \Ro, \Ro, \TMP ;# Ro = odds
-.endm
-
-.macro vinterp_no_store P0 P1 P2 P3 P4 P5
- vmuleub v8, \P0, v0 ;# 64 + 4 positive taps
- vadduhm v16, v6, v8
- vmuloub v8, \P0, v0
- vadduhm v17, v6, v8
- Msum v16, v17, \P2, v2, v8
- Msum v16, v17, \P3, v3, v8
- Msum v16, v17, \P5, v5, v8
-
- vmuleub v18, \P1, v1 ;# 2 negative taps
- vmuloub v19, \P1, v1
- Msum v18, v19, \P4, v4, v8
-
- vsubuhs v16, v16, v18 ;# subtract neg from pos
- vsubuhs v17, v17, v19
- vsrh v16, v16, v7 ;# divide by 128
- vsrh v17, v17, v7 ;# v16 v17 = evens, odds
- vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order
- vmrglh v19, v16, v17
- vpkuhus \P0, v18, v19 ;# P0 = 8-bit result
-.endm
-
-.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
- vmuleub v24, \P0, v13 ;# 64 + 4 positive taps
- vadduhm v21, v20, v24
- vmuloub v24, \P0, v13
- vadduhm v22, v20, v24
- Msum v21, v22, \P2, v15, v25
- Msum v21, v22, \P3, v16, v25
- Msum v21, v22, \P5, v18, v25
-
- vmuleub v23, \P1, v14 ;# 2 negative taps
- vmuloub v24, \P1, v14
- Msum v23, v24, \P4, v17, v25
-
- vsubuhs v21, v21, v23 ;# subtract neg from pos
- vsubuhs v22, v22, v24
- vsrh v21, v21, v19 ;# divide by 128
- vsrh v22, v22, v19 ;# v16 v17 = evens, odds
- vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order
- vmrglh v24, v21, v22
- vpkuhus \P0, v23, v24 ;# P0 = 8-bit result
-.endm
-
-
-.macro Vinterp P0 P1 P2 P3 P4 P5
- vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
- stvx \P0, 0, r7
- add r7, r7, r8 ;# 33 ops per 16 pels
-.endm
-
-
-.macro luma_v P0, P1, P2, P3, P4, P5
- addi r9, r9, 16 ;# P5 = newest input row
- lvx \P5, 0, r9
- Vinterp \P0, \P1, \P2, \P3, \P4, \P5
-.endm
-
-.macro luma_vtwo
- luma_v v10, v11, v12, v13, v14, v15
- luma_v v11, v12, v13, v14, v15, v10
-.endm
-
-.macro luma_vfour
- luma_vtwo
- luma_v v12, v13, v14, v15, v10, v11
- luma_v v13, v14, v15, v10, v11, v12
-.endm
-
-.macro luma_vsix
- luma_vfour
- luma_v v14, v15, v10, v11, v12, v13
- luma_v v15, v10, v11, v12, v13, v14
-.endm
-
-.macro Interp4 R I I4
- vmsummbm \R, v13, \I, v15
- vmsummbm \R, v14, \I4, \R
-.endm
-
-.macro Read8x8 VD, RS, RP, increment_counter
-    lvsl v21, 0, \RS ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx \VD, 0, \RS
- lvx v20, r10, \RS
-
-.if \increment_counter
- add \RS, \RS, \RP
-.endif
-
- vperm \VD, \VD, v20, v21
-.endm
-
-.macro interp_8x8 R
- vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456
- vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A
- Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3
- vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx
- Interp4 v21, v21, \R ;# v21 = result 4 5 6 7
-
- vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7
- vsrh \R, \R, v19
-
- vpkuhus \R, \R, \R ;# saturate and pack
-
-.endm
-
-.macro Read4x4 VD, RS, RP, increment_counter
-    lvsl v21, 0, \RS ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v20, 0, \RS
-
-.if \increment_counter
- add \RS, \RS, \RP
-.endif
-
- vperm \VD, v20, v20, v21
-.endm
- .text
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-sixtap_predict_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xff87
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- vertical_only_4x4
-
- ;# load up horizontal filter
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_4x4
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
-
- b second_pass_4x4
-
-vertical_only_4x4:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_4x4:
- load_c v20, b_hilo_4x4, 0, r9, r10
- load_c v21, b_hilo, 0, r9, r10
-
- ;# reposition input so that it can go through the
- ;# filtering phase with one pass.
- vperm v0, v0, v1, v20 ;# 0 1 x x
- vperm v2, v2, v3, v20 ;# 2 3 x x
- vperm v4, v4, v5, v20 ;# 4 5 x x
- vperm v6, v6, v7, v20 ;# 6 7 x x
-
- vperm v0, v0, v2, v21 ;# 0 1 2 3
- vperm v4, v4, v6, v21 ;# 4 5 6 7
-
- vsldoi v1, v0, v4, 4
- vsldoi v2, v0, v4, 8
- vsldoi v3, v0, v4, 12
-
- vsldoi v5, v4, v8, 4
-
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
-
- stvx v0, 0, r1
-
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 4(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 8(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 12(r1)
- stw r0, 0(r7)
-
- b exit_4x4
-
-store_4x4:
-
- stvx v2, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v3, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v4, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v5, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
-
-exit_4x4:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro w_8x8 V, D, R, P
- stvx \V, 0, r1
- lwz \R, 0(r1)
- stw \R, 0(r7)
- lwz \R, 4(r1)
- stw \R, 4(r7)
- add \D, \D, \P
-.endm
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-sixtap_predict8x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- second_pass_pre_copy_8x4
-
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_8x4
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
-
- b second_pass_8x4
-
-second_pass_pre_copy_8x4:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_8x4:
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
- vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
- vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
- vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x4
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
-
- b exit_8x4
-
-store_aligned_8x4:
-
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
-
- b exit_8x4
-
-store_8x4:
- cmpi cr0, r8, 8
- beq cr0, store_aligned2_8x4
-
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
-
- b exit_8x4
-
-store_aligned2_8x4:
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
-
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
-
-exit_8x4:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Because the width that needs to be filtered will fit in a single altivec
-;# register, there is no need to loop. Everything can stay in registers.
-sixtap_predict8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- second_pass_pre_copy_8x8
-
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
- Read8x8 v9, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
- interp_8x8 v9
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_8x8
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v10, r3, r4, 1
- Read8x8 v11, r3, r4, 1
- Read8x8 v12, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v10
- interp_8x8 v11
- interp_8x8 v12
-
- b second_pass_8x8
-
-second_pass_pre_copy_8x8:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
- Read8x8 v9, r3, r4, 1
- Read8x8 v10, r3, r4, 1
- Read8x8 v11, r3, r4, 1
- Read8x8 v12, r3, r4, 0
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_8x8:
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
- vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
- vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
- vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
- vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
- vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
- vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
- vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x8
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
-
- b exit_8x8
-
-store_aligned_8x8:
-
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
-
- b exit_8x8
-
-store_8x8:
- cmpi cr0, r8, 8
- beq cr0, store_aligned2_8x8
-
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
- w_8x8 v8, r7, r0, r8
- w_8x8 v9, r7, r0, r8
-
- b exit_8x8
-
-store_aligned2_8x8:
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
- vperm v8, v8, v9, v10
-
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
- addi r7, r7, 16
- stvx v8, 0, r7
-
-exit_8x8:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Two pass filtering. First pass is Horizontal edges, second pass is vertical
-;# edges. One of the filters can be null, but both won't be. Needs to use a
-;# temporary buffer because the source buffer can't be modified and the buffer
-;# for the destination is not large enough to hold the temporary data.
-sixtap_predict16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xf000
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-416(r1) ;# create space on the stack
-
-    ;# Three possibilities
- ;# 1. First filter is null. Don't use a temp buffer.
- ;# 2. Second filter is null. Don't use a temp buffer.
- ;# 3. Neither are null, use temp buffer.
-
- ;# First Pass (horizontal edge)
- ;# setup pointers for src
-    ;# if possibility (1) then set up the src pointer to be the original and jump
- ;# to second pass. this is based on if x_offset is 0.
-
- ;# load up horizontal filter
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- load_hfilter v4, v5
-
- beq- copy_horizontal_16x21
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v14, b_hperm, 0, r9, r10
-
- ;# These statements are guessing that there won't be a second pass,
- ;# but if there is then inside the bypass they need to be set
- li r0, 16 ;# prepare for no vertical filter
-
- ;# Change the output pointer and pitch to be the actual
-    ;# destination instead of a temporary buffer.
- addi r9, r7, 0
- addi r5, r8, 0
-
- ;# no vertical filter, so write the output from the first pass
- ;# directly into the output buffer.
- beq- no_vertical_filter_bypass
-
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
-
- ;# setup counter for the number of lines that are going to be filtered
- li r0, 21
-
- ;# use the stack as temporary storage
- la r9, 48(r1)
- li r5, 16
-
-no_vertical_filter_bypass:
-
- mtctr r0
-
- ;# rounding added in on the multiply
- vspltisw v10, 8
- vspltisw v12, 3
- vslw v12, v10, v12 ;# 0x00000040000000400000004000000040
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v13, 7
-
- ;# index to the next set of vectors in the row.
- li r10, 16
- li r12, 32
-
-horizontal_loop_16x16:
-
-    lvsl v15, 0, r3 ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v1, 0, r3
- lvx v2, r10, r3
- lvx v3, r12, r3
-
- vperm v8, v1, v2, v15
- vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified
-
- vsldoi v11, v8, v9, 4
-
- ;# set 0
- vmsummbm v6, v4, v8, v12 ;# taps times elements
- vmsummbm v0, v5, v11, v6
-
- ;# set 1
- vsldoi v10, v8, v9, 1
- vsldoi v11, v8, v9, 5
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v1, v5, v11, v6
-
- ;# set 2
- vsldoi v10, v8, v9, 2
- vsldoi v11, v8, v9, 6
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v2, v5, v11, v6
-
- ;# set 3
- vsldoi v10, v8, v9, 3
- vsldoi v11, v8, v9, 7
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v3, v5, v11, v6
-
- vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F
-
- vsrh v0, v0, v13 ;# divide v0, v1 by 128
- vsrh v1, v1, v13
-
- vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result
- vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result
-
- stvx v0, 0, r9
- add r9, r9, r5
-
- add r3, r3, r4
-
- bdnz horizontal_loop_16x16
-
- ;# check again to see if vertical filter needs to be done.
- cmpi cr0, r6, 0
- beq cr0, end_16x16
-
- ;# yes there is, so go to the second pass
- b second_pass_16x16
-
-copy_horizontal_16x21:
- li r10, 21
- mtctr r10
-
- li r10, 16
-
- sub r3, r3, r4
- sub r3, r3, r4
-
- ;# this is done above if there is a horizontal filter,
- ;# if not it needs to be done down here.
- slwi r6, r6, 4 ;# index into vertical filter array
-
- ;# always write to the stack when doing a horizontal copy
- la r9, 48(r1)
-
-copy_horizontal_loop_16x21:
-    lvsl v15, 0, r3 ;# permute value for alignment
-
- lvx v1, 0, r3
- lvx v2, r10, r3
-
- vperm v8, v1, v2, v15
-
- stvx v8, 0, r9
- addi r9, r9, 16
-
- add r3, r3, r4
-
- bdnz copy_horizontal_loop_16x21
-
-second_pass_16x16:
-
- ;# always read from the stack when doing a vertical filter
- la r9, 48(r1)
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v7, 7
-
- vpre_load
-
- luma_vsix
- luma_vsix
- luma_vfour
-
-end_16x16:
-
- addi r1, r1, 416 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-HFilter:
- .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12
- .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
- .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36
- .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
- .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
- .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
- .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77
- .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0
- .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
- .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
- .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108
- .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0
- .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123
- .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
-
- .align 4
-VFilter:
- .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-
- .align 4
-b_hperm:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-B_0123:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-B_4567:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
- .align 4
-B_89AB:
- .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-
- .align 4
-b_hilo:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-
- .align 4
-b_hilo_4x4:
- .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
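The deleted filter_altivec.asm implemented VP8's two-pass six-tap subpel prediction with AltiVec; the HFilter/VFilter tables above carry the eight tap sets, and the comments record the 0x40 rounding term and the divide-by-128 downshift. A scalar reference of a single tap application, reconstructed from those tables and comments (six_tap and its calling convention are illustrative, not the library API):

    /* p points at the centre pixel; taps[0..5] weight p[-2]..p[3] */
    static unsigned char six_tap(const unsigned char *p, const int *taps) {
        int sum = taps[0] * p[-2] + taps[1] * p[-1] + taps[2] * p[0] +
                  taps[3] * p[1]  + taps[4] * p[2]  + taps[5] * p[3];
        sum = (sum + 64) >> 7;  /* rounding, then divide by 128 */
        if (sum < 0) sum = 0;   /* clamp, as the saturating vpkuhus pack does */
        if (sum > 255) sum = 255;
        return (unsigned char)sum;
    }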
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm
deleted file mode 100644
index fd8aa665fdf..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm
+++ /dev/null
@@ -1,677 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl bilinear_predict4x4_ppc
- .globl bilinear_predict8x4_ppc
- .globl bilinear_predict8x8_ppc
- .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
- load_c \V0, vfilter_b, r6, r9, r10
-
- addi r6, r6, 16
- lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
- ;# load up horizontal filter
- slwi. r5, r5, 4 ;# index into horizontal filter array
-
- ;# index to the next set of vectors in the row.
- li r10, 16
- li r12, 32
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq \jump_label
-
- load_c v20, hfilter_b, r5, r9, r0
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v28, b_hperm_b, 0, r9, r0
-
- ;# rounding added in on the multiply
- vspltisw v21, 8
- vspltisw v18, 3
- vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
- slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro HFilter V
- vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456
- vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A
-
- vmsummbm v24, v20, v24, v18
- vmsummbm v25, v20, v25, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
-
- vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
-    lvsl v17, 0, r3 ;# permute value for alignment
-
- ;# input to filter is 9 bytes wide, output is 8 bytes.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
-
- HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
-    lvsl v17, 0, r3 ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
- stvx \V, 0, r7
-
-.if \increment_counter
- add r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
- vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
- vadduhm v22, v18, v22
- vmuloub v23, \P0, v20
- vadduhm v23, v18, v23
-
- vmuleub v24, \P1, v21
- vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
- vmuloub v25, \P1, v21
- vadduhm v23, v23, v25 ;# Ro = odds
-
- vsrh v22, v22, v19 ;# divide by 128
- vsrh v23, v23, v19 ;# v16 v17 = evens, odds
- vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
- vmrglh v23, v22, v23
- vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
-
-
-.macro w_8x8 V, D, R, P
- stvx \V, 0, r1
- lwz \R, 0(r1)
- stw \R, 0(r7)
- lwz \R, 4(r1)
- stw \R, 4(r7)
- add \D, \D, \P
-.endm
-
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_4x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_4x4_b
-
- hfilter_8 v4, 0
-
- b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
-
-second_pass_4x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-store_out_4x4_b:
-
- stvx v0, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v1, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v2, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v3, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
-
-exit_4x4:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_8x4_b
-
- hfilter_8 v4, 0
-
- b second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
-
-second_pass_8x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-store_out_8x4_b:
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x4_b
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
-
- b exit_8x4
-
-store_aligned_8x4_b:
- load_c v10, b_hilo_b, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
-
-exit_8x4:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff0
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x8_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
- hfilter_8 v4, 1
- hfilter_8 v5, 1
- hfilter_8 v6, 1
- hfilter_8 v7, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_8x8_b
-
- hfilter_8 v8, 0
-
- b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
- load_and_align_8 v5, 1
- load_and_align_8 v6, 1
- load_and_align_8 v7, 1
- load_and_align_8 v8, 0
-
-second_pass_8x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-store_out_8x8_b:
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x8_b
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
-
- b exit_8x8
-
-store_aligned_8x8_b:
- load_c v10, b_hilo_b, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
-
-exit_8x8:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl v17, 0, r3 ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
- lvx v23, r12, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
- vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
-
- ;# set 0
- vmsummbm v24, v20, v21, v18 ;# taps times elements
-
- ;# set 1
- vsldoi v23, v21, v22, 1
- vmsummbm v25, v20, v23, v18
-
- ;# set 2
- vsldoi v23, v21, v22, 2
- vmsummbm v26, v20, v23, v18
-
- ;# set 3
- vsldoi v23, v21, v22, 3
- vmsummbm v27, v20, v23, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
- vsrh v25, v25, v19
-
- vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
- vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
-
-.macro load_and_align_16 V, increment_counter
-    lvsl v17, 0, r3 ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
-.macro write_16 V, increment_counter
- stvx \V, 0, r7
-
-.if \increment_counter
- add r7, r7, r8
-.endif
-.endm
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- HProlog second_pass_16x16_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
- hfilter_16 v8, 1
- hfilter_16 v9, 1
- hfilter_16 v10, 1
- hfilter_16 v11, 1
- hfilter_16 v12, 1
- hfilter_16 v13, 1
- hfilter_16 v14, 1
- hfilter_16 v15, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_16x16_b
-
- hfilter_16 v16, 0
-
- b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, 1
- load_and_align_16 v1, 1
- load_and_align_16 v2, 1
- load_and_align_16 v3, 1
- load_and_align_16 v4, 1
- load_and_align_16 v5, 1
- load_and_align_16 v6, 1
- load_and_align_16 v7, 1
- load_and_align_16 v8, 1
- load_and_align_16 v9, 1
- load_and_align_16 v10, 1
- load_and_align_16 v11, 1
- load_and_align_16 v12, 1
- load_and_align_16 v13, 1
- load_and_align_16 v14, 1
- load_and_align_16 v15, 1
- load_and_align_16 v16, 0
-
-second_pass_16x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-store_out_16x16_b:
-
- write_16 v0, 1
- write_16 v1, 1
- write_16 v2, 1
- write_16 v3, 1
- write_16 v4, 1
- write_16 v5, 1
- write_16 v6, 1
- write_16 v7, 1
- write_16 v8, 1
- write_16 v9, 1
- write_16 v10, 1
- write_16 v11, 1
- write_16 v12, 1
- write_16 v13, 1
- write_16 v14, 1
- write_16 v15, 0
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-hfilter_b:
- .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
- .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
- .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
- .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
- .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
- .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
- .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
- .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
-
- .align 4
-vfilter_b:
- .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
- .align 4
-b_hperm_b:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-b_0123_b:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-b_4567_b:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
-b_hilo_b:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
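filter_bilinear_altivec.asm, deleted above, is the two-tap counterpart: every row of hfilter_b/vfilter_b is a pair summing to 128, so the same +64 rounding and >>7 shift renormalize the result. A scalar sketch derived from those tables (the offset parameterization is inferred from the 16-per-step progression in hfilter_b):

    /* offset = 1/8-pel position 0..7; taps are (128 - 16*offset, 16*offset) */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                      int offset) {
        const int t0 = 128 - 16 * offset;
        const int t1 = 16 * offset;
        return (unsigned char)((a * t0 + b * t1 + 64) >> 7);
    }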
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/idctllm_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/idctllm_altivec.asm
deleted file mode 100644
index 117d9cfc8e8..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/idctllm_altivec.asm
+++ /dev/null
@@ -1,189 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl short_idct4x4llm_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
- .align 2
-short_idct4x4llm_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- load_c v8, sinpi8sqrt2, 0, r9, r10
- load_c v9, cospi8sqrt2minus1, 0, r9, r10
- load_c v10, hi_hi, 0, r9, r10
- load_c v11, lo_lo, 0, r9, r10
- load_c v12, shift_16, 0, r9, r10
-
- li r10, 16
- lvx v0, 0, r3 ;# input ip[0], ip[ 4]
- lvx v1, r10, r3 ;# input ip[8], ip[12]
-
- ;# first pass
- vupkhsh v2, v0
- vupkhsh v3, v1
- vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
- vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
-
- vupklsh v0, v0
- vmulosh v4, v0, v8
- vsraw v4, v4, v12
- vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
- vupklsh v1, v1
- vmulosh v5, v1, v9
- vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v1
-
- vsubsws v4, v4, v5 ;# c1
-
- vmulosh v3, v1, v8
- vsraw v3, v3, v12
- vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v0, v9
- vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v0
-
- vaddsws v3, v3, v5 ;# d1
-
- vaddsws v0, v6, v3 ;# a1 + d1
- vsubsws v3, v6, v3 ;# a1 - d1
-
- vaddsws v1, v7, v4 ;# b1 + c1
- vsubsws v2, v7, v4 ;# b1 - c1
-
- ;# transpose input
- vmrghw v4, v0, v1 ;# a0 b0 a1 b1
- vmrghw v5, v2, v3 ;# c0 d0 c1 d1
-
- vmrglw v6, v0, v1 ;# a2 b2 a3 b3
- vmrglw v7, v2, v3 ;# c2 d2 c3 d3
-
- vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
- vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
-
- vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
- vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
-
- ;# second pass
- vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
- vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
-
- vmulosh v4, v1, v8
- vsraw v4, v4, v12
- vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v3, v9
- vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v3
-
- vsubsws v4, v4, v5 ;# c1
-
- vmulosh v2, v3, v8
- vsraw v2, v2, v12
- vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v1, v9
- vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v1
-
- vaddsws v3, v2, v5 ;# d1
-
- vaddsws v0, v6, v3 ;# a1 + d1
- vsubsws v3, v6, v3 ;# a1 - d1
-
- vaddsws v1, v7, v4 ;# b1 + c1
- vsubsws v2, v7, v4 ;# b1 - c1
-
- vspltish v6, 4
- vspltish v7, 3
-
- vpkswss v0, v0, v1
- vpkswss v1, v2, v3
-
- vaddshs v0, v0, v6
- vaddshs v1, v1, v6
-
- vsrah v0, v0, v7
- vsrah v1, v1, v7
-
- ;# transpose output
- vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
- vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
-
- vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
- vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
-
- stwu r1,-416(r1) ;# create space on the stack
-
- stvx v0, 0, r1
- lwz r6, 0(r1)
- stw r6, 0(r4)
- lwz r6, 4(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- lwz r6, 8(r1)
- stw r6, 0(r4)
- lwz r6, 12(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- stvx v1, 0, r1
- lwz r6, 0(r1)
- stw r6, 0(r4)
- lwz r6, 4(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- lwz r6, 8(r1)
- stw r6, 0(r4)
- lwz r6, 12(r1)
- stw r6, 4(r4)
-
- addi r1, r1, 416 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 4
-sinpi8sqrt2:
- .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
- .align 4
-cospi8sqrt2minus1:
- .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
- .align 4
-shift_16:
- .long 16, 16, 16, 16
-
- .align 4
-hi_hi:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-
- .align 4
-lo_lo:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
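The constants in the deleted idctllm_altivec.asm are Q16 fixed-point encodings: 35468 is approximately sin(pi/8)*sqrt(2)*65536 and 20091 approximately (cos(pi/8)*sqrt(2) - 1)*65536. Storing the cosine factor minus one keeps that constant inside signed 16-bit range, hence the add of the input after that multiply; in 32-bit C the sine factor needs no such correction. Scalar equivalents of the vmulosh/vsraw/vaddsws sequences, assuming 32-bit int arithmetic:

    static const int sinpi8sqrt2 = 35468;       /* sin(pi/8)*sqrt(2), Q16 */
    static const int cospi8sqrt2minus1 = 20091; /* cos(pi/8)*sqrt(2)-1, Q16 */

    static int mul_sin(int x) { return (x * sinpi8sqrt2) >> 16; }
    static int mul_cos(int x) { return x + ((x * cospi8sqrt2minus1) >> 16); }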
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_altivec.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_altivec.c
deleted file mode 100644
index 71bf6e2d759..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_altivec.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef void loop_filter_function_y_ppc
-(
- unsigned char *s, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh
-);
-
-typedef void loop_filter_function_uv_ppc
-(
- unsigned char *u, // source pointer
- unsigned char *v, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh
-);
-
-typedef void loop_filter_function_s_ppc
-(
- unsigned char *s, // source pointer
- int p, // pitch
- const signed char *flimit
-);
-
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal MB filtering
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical MB Filtering
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal B Filtering
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- // These should all be done at once with one call, instead of 3
- loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
- loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
- loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
-}
-
-// Vertical B Filtering
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
- loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
- loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm
deleted file mode 100644
index 61df4e97639..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm
+++ /dev/null
@@ -1,1253 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl mbloop_filter_horizontal_edge_y_ppc
- .globl loop_filter_horizontal_edge_y_ppc
- .globl mbloop_filter_vertical_edge_y_ppc
- .globl loop_filter_vertical_edge_y_ppc
-
- .globl mbloop_filter_horizontal_edge_uv_ppc
- .globl loop_filter_horizontal_edge_uv_ppc
- .globl mbloop_filter_vertical_edge_uv_ppc
- .globl loop_filter_vertical_edge_uv_ppc
-
- .globl loop_filter_simple_horizontal_edge_ppc
- .globl loop_filter_simple_vertical_edge_ppc
-
- .text
-;# We often need to perform transposes (and other transpose-like operations)
-;# on matrices of data. This is simplified by the fact that we usually
-;# operate on hunks of data whose dimensions are powers of 2, or at least
-;# divisible by highish powers of 2.
-;#
-;# These operations can be very confusing. They become more straightforward
-;# when we think of them as permutations of address bits: Concatenate a
-;# group of vector registers and think of it as occupying a block of
-;# memory beginning at address zero. The low four bits 0...3 of the
-;# address then correspond to position within a register, the higher-order
-;# address bits select the register.
-;#
-;# Although register selection, at the code level, is arbitrary, things
-;# are simpler if we use contiguous ranges of register numbers, simpler
-;# still if the low-order bits of the register number correspond to
-;# conceptual address bits. We do this whenever reasonable.
-;#
-;# A 16x16 transpose can then be thought of as an operation on
-;# a 256-element block of memory. It takes 8 bits 0...7 to address this
-;# memory and the effect of a transpose is to interchange address bit
-;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
-;# column, which is interchanged with the row addressed by bits 4..7.
-;#
-;# The altivec merge instructions provide a rapid means of effecting
-;# many of these transforms. They operate at three widths (8,16,32).
-;# Writing V(x) for vector register #x, paired merges permute address
-;# indices as follows.
-;#
-;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
-;#
-;# vmrghb V( x), V( y), V( y + (1<<s))
-;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
-;#
-;# vmrghh V( x), V( y), V( y + (1<<s))
-;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# =0= =1= 2->3 3->(4+d) (4+s)->2:
-;#
-;# vmrghw V( x), V( y), V( y + (1<<s))
-;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# Unfortunately, there is no doubleword merge instruction.
-;# The following sequence uses "vperm" as a substitute.
-;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
-;# are in registers Vhihi and Vlolo, we can also effect the permutation
-;#
-;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
-;#
-;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
-;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
-;#
-;#
-;# Except for bits s and d, the other relationships between register
-;# number (= high-order part of address) bits are at the disposal of
-;# the programmer.
-;#
-
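-;# As an illustrative scalar sketch (not part of the original source),
-;# the 16x16 transpose is exactly this address-bit swap in C, where
-;# src/dst stand for the 256-byte register block described above:
-;#
-;#   for (int a = 0; a < 256; ++a)            /* 8 address bits */
-;#       dst[((a & 0x0f) << 4) | (a >> 4)] = src[a];
-;#
-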
-;# To avoid excess transposes, we filter all 3 vertical luma subblock
-;# edges together. This requires a single 16x16 transpose, which, in
-;# the above language, amounts to the following permutation of address
-;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
-;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;# Except for the fact that the destination registers get written
-;# before we are done referencing the old contents, the cyclic transform
-;# is effected by
-;#
-;# x = 0; do {
-;#   vmrghb V(2x),   V(x), V(x+8);
-;#   vmrglb V(2x+1), V(x), V(x+8);
-;# } while( ++x < 8);
-;#
-;# For clarity, and because we can afford it, we do this transpose
-;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
-;# leaving the final result in 16 .. 31, as the lower registers are
-;# used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
- vmrghb \A, \X, \Y
- vmrglb \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
- Tpair v16,v17, v0,v8
- Tpair v18,v19, v1,v9
- Tpair v20,v21, v2,v10
- Tpair v22,v23, v3,v11
- Tpair v24,v25, v4,v12
- Tpair v26,v27, v5,v13
- Tpair v28,v29, v6,v14
- Tpair v30,v31, v7,v15
-.endm
-
-.macro t16_odd
- Tpair v0,v1, v16,v24
- Tpair v2,v3, v17,v25
- Tpair v4,v5, v18,v26
- Tpair v6,v7, v19,v27
- Tpair v8,v9, v20,v28
- Tpair v10,v11, v21,v29
- Tpair v12,v13, v22,v30
- Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
- t16_odd
- t16_even
- t16_odd
- t16_even
-.endm
-
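-;# Scalar check of the claim above (a sketch, not from the original
-;# source): each merge pass rotates the 8 address bits left by one,
-;# and four passes compose to the nibble swap, i.e. the transpose:
-;#
-;#   unsigned rot1(unsigned a) { return ((a << 1) | (a >> 7)) & 0xff; }
-;#   /* rot1(rot1(rot1(rot1(a)))) == ((a & 0x0f) << 4) | (a >> 4) */
-;#
-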
-;# Vertical edge filtering requires transposes. For the simple filter,
-;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;# v0 = 0 1 ... 14 15
-;# v1 = 16 17 ... 30 31
-;#    v2 = 32 33 ... 46 47
-;#    v3 = 48 49 ... 62 63
-;#
-;# In frame-buffer memory, the layout is:
-;#
-;# 0 16 32 48
-;# 1 17 33 49
-;# ...
-;# 15 31 47 63.
-;#
-;# We begin by reading the data 32 bits at a time (using scalar operations)
-;# into a temporary array, reading the rows of the array into vector registers,
-;# with the following layout:
-;#
-;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
-;# v1 = 1 17 33 49 5 21 ... 45 61
-;# v2 = 2 18 ... 46 62
-;# v3 = 3 19 ... 47 63
-;#
-;# From the "address-bit" perspective discussed above, we simply need to
-;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
-;# In other words, we transpose each of the four 4x4 submatrices.
-;#
-;# This transformation is its own inverse, and we need to perform it
-;# again before writing the pixels back into the frame buffer.
-;#
-;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
-;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors
-;# defined above. We think of both groups of 4 registers as having
-;# "addresses" {0,1,2,3} * 16.
-;#
-.macro Transpose4times4x4 Vlo, Vhi
-
- ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
-
- vmrghb v4, v0, v1
- vmrglb v5, v0, v1
- vmrghb v6, v2, v3
- vmrglb v7, v2, v3
-
- ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
-
- vmrghh v0, v4, v6
- vmrglh v1, v4, v6
- vmrghh v2, v5, v7
- vmrglh v3, v5, v7
-
- ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
-
- vmrghw v4, v0, v1
- vmrglw v5, v0, v1
- vmrghw v6, v2, v3
- vmrglw v7, v2, v3
-
- ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
-
- vperm v0, v4, v6, \Vlo
- vperm v1, v4, v6, \Vhi
- vperm v2, v5, v7, \Vlo
- vperm v3, v5, v7, \Vhi
-.endm
-;# end Transpose4times4x4
-
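-;# Scalar model of this permutation (illustrative, not from the
-;# original source): over the 64-byte block, bits 0,1 and 4,5 of the
-;# address trade places while bits 2,3 stay put:
-;#
-;#   for (int a = 0; a < 64; ++a) {
-;#       int b = (a & 0x0c)             /* bits 2,3 unchanged */
-;#             | ((a & 0x03) << 4)      /* bits 0,1 -> 4,5    */
-;#             | ((a >> 4) & 0x03);     /* bits 4,5 -> 0,1    */
-;#       dst[b] = src[a];
-;#   }
-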
-
-;# Normal mb vertical edge filter transpose.
-;#
-;# We read 8 columns of data, initially in the following pattern:
-;#
-;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
-;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
-;# ...
-;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
-;#
-;# and wish to convert to:
-;#
-;# (0,0) ... (0,15)
-;# (1,0) ... (1,15)
-;# ...
-;# (7,0) ... (7,15).
-;#
-;# In "address bit" language, we wish to map
-;#
-;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
-;#
-;# This can be accomplished by 4 iterations of the cyclic transform
-;#
-;# I -> (I+1) mod 7;
-;#
-;# each iteration can be realized by (d=0, s=2):
-;#
-;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
-;#
-;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
-;# preserving v8 = sign converter.
-;#
-;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
-;# result lands in the "mirror" registers v10...v17
-;#
-.macro t8x16_odd
- Tpair v10, v11, v0, v4
- Tpair v12, v13, v1, v5
- Tpair v14, v15, v2, v6
- Tpair v16, v17, v3, v7
-.endm
-
-.macro t8x16_even
- Tpair v0, v1, v10, v14
- Tpair v2, v3, v11, v15
- Tpair v4, v5, v12, v16
- Tpair v6, v7, v13, v17
-.endm
-
-.macro transpose8x16_fwd
- t8x16_odd
- t8x16_even
- t8x16_odd
- t8x16_even
-.endm
-
-.macro transpose8x16_inv
- t8x16_odd
- t8x16_even
- t8x16_odd
-.endm
-
-.macro Transpose16x16
- vmrghb v0, v16, v24
- vmrglb v1, v16, v24
- vmrghb v2, v17, v25
- vmrglb v3, v17, v25
- vmrghb v4, v18, v26
- vmrglb v5, v18, v26
- vmrghb v6, v19, v27
- vmrglb v7, v19, v27
- vmrghb v8, v20, v28
- vmrglb v9, v20, v28
- vmrghb v10, v21, v29
- vmrglb v11, v21, v29
- vmrghb v12, v22, v30
- vmrglb v13, v22, v30
- vmrghb v14, v23, v31
- vmrglb v15, v23, v31
- vmrghb v16, v0, v8
- vmrglb v17, v0, v8
- vmrghb v18, v1, v9
- vmrglb v19, v1, v9
- vmrghb v20, v2, v10
- vmrglb v21, v2, v10
- vmrghb v22, v3, v11
- vmrglb v23, v3, v11
- vmrghb v24, v4, v12
- vmrglb v25, v4, v12
- vmrghb v26, v5, v13
- vmrglb v27, v5, v13
- vmrghb v28, v6, v14
- vmrglb v29, v6, v14
- vmrghb v30, v7, v15
- vmrglb v31, v7, v15
- vmrghb v0, v16, v24
- vmrglb v1, v16, v24
- vmrghb v2, v17, v25
- vmrglb v3, v17, v25
- vmrghb v4, v18, v26
- vmrglb v5, v18, v26
- vmrghb v6, v19, v27
- vmrglb v7, v19, v27
- vmrghb v8, v20, v28
- vmrglb v9, v20, v28
- vmrghb v10, v21, v29
- vmrglb v11, v21, v29
- vmrghb v12, v22, v30
- vmrglb v13, v22, v30
- vmrghb v14, v23, v31
- vmrglb v15, v23, v31
- vmrghb v16, v0, v8
- vmrglb v17, v0, v8
- vmrghb v18, v1, v9
- vmrglb v19, v1, v9
- vmrghb v20, v2, v10
- vmrglb v21, v2, v10
- vmrghb v22, v3, v11
- vmrglb v23, v3, v11
- vmrghb v24, v4, v12
- vmrglb v25, v4, v12
- vmrghb v26, v5, v13
- vmrglb v27, v5, v13
- vmrghb v28, v6, v14
- vmrglb v29, v6, v14
- vmrghb v30, v7, v15
- vmrglb v31, v7, v15
-.endm
-
-;# load_g loads a global vector (whose address is in the local variable Gptr)
-;# into vector register Vreg. Trashes r0
-.macro load_g Vreg, Gptr
- lwz r0, \Gptr
- lvx \Vreg, 0, r0
-.endm
-
-;# exploit the saturation here. if the answer is negative
-;# it will be clamped to 0. orring 0 with a positive
-;# number will be the positive number (abs)
-;# RES = abs( A-B), trashes TMP
-.macro Abs RES, TMP, A, B
- vsububs \RES, \A, \B
- vsububs \TMP, \B, \A
- vor \RES, \RES, \TMP
-.endm
-
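-;# Scalar model of the Abs trick (illustrative, not from the original
-;# source): saturating unsigned subtraction clamps a negative result
-;# to zero, so OR-ing both orders gives |a - b| with no branch:
-;#
-;#   unsigned char sat_sub(unsigned char a, unsigned char b) {
-;#       return a > b ? (unsigned char)(a - b) : 0;
-;#   }
-;#   unsigned char abs_diff(unsigned char a, unsigned char b) {
-;#       return sat_sub(a, b) | sat_sub(b, a);
-;#   }
-;#
-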
-;# RES = Max( RES, abs( A-B)), trashes TMP
-.macro max_abs RES, TMP, A, B
- vsububs \TMP, \A, \B
- vmaxub \RES, \RES, \TMP
- vsububs \TMP, \B, \A
- vmaxub \RES, \RES, \TMP
-.endm
-
-.macro Masks
- ;# build masks
-    ;# input is all 8-bit unsigned (0-255). need to
-    ;# do abs(val_a - val_b) > limit, but there is no need to compare
-    ;# each value to the limit: find the max of the absolute
-    ;# differences and compare that to the limit once.
- ;# First hev
- Abs v14, v13, v2, v3 ;# |P1 - P0|
- max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
-
- vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
-
- ;# Next limit
- max_abs v14, v13, v0, v1 ;# |P3 - P2|
- max_abs v14, v13, v1, v2 ;# |P2 - P1|
- max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
- max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
-
- vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
-
- ;# flimit
- Abs v14, v13, v3, v4 ;# |P0 - Q0|
-
- vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
-
- vor v8, v8, v9 ;# R = true if flimit or limit exceeded
- ;# done building masks
-.endm
-
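-;# One-lane scalar model of Masks (an illustrative sketch, not from
-;# the original source); the vector code evaluates this for all 16
-;# lanes at once:
-;#
-;#   int a    = max(abs(p1 - p0), abs(q1 - q0));
-;#   int hev  = a > thresh;                 /* high edge variance */
-;#   a = max(a, abs(p3 - p2));  a = max(a, abs(p2 - p1));
-;#   a = max(a, abs(q2 - q1));  a = max(a, abs(q3 - q2));
-;#   int skip = (a > limit) || (abs(p0 - q0) > flimit);
-;#   /* the filter value is later ANDed with ~skip, so those lanes
-;#      are left untouched */
-;#
-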
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
- ;# build constants
- lvx \FL, 0, \RFL ;# flimit
- lvx \LI, 0, \RLI ;# limit
- lvx \TH, 0, \RTH ;# thresh
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-.endm
-
-.macro load_data_y
- ;# setup strides/pointers to be able to access
- ;# all of the data
- add r5, r4, r4 ;# r5 = 2 * stride
- sub r6, r3, r5 ;# r6 -> 2 rows back
- neg r7, r4 ;# r7 = -stride
-
- ;# load 16 pixels worth of data to work on
- sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
- lvx v0, 0, r0 ;# P3 (read only)
- lvx v1, r7, r6 ;# P2
- lvx v2, 0, r6 ;# P1
- lvx v3, r7, r3 ;# P0
- lvx v4, 0, r3 ;# Q0
- lvx v5, r4, r3 ;# Q1
- lvx v6, r5, r3 ;# Q2
- add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
- lvx v7, r4, r0 ;# Q3 (read only)
-.endm
-
-;# Expects
-;# v10 == HEV
-;# v13 == tmp
-;# v14 == tmp
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
- vxor \P1, \P1, v11 ;# SP1
- vxor \P0, \P0, v11 ;# SP0
- vxor \Q0, \Q0, v11 ;# SQ0
- vxor \Q1, \Q1, v11 ;# SQ1
-
- vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
-.if \HEV_PRESENT
- vand v13, v13, v10 ;# f &= hev
-.endif
- vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
- vandc v13, v13, v8 ;# f &= mask
-
- vspltisb v8, 3
- vspltisb v9, 4
-
- vaddsbs v14, v13, v9 ;# f1 = c (f+4)
- vaddsbs v15, v13, v8 ;# f2 = c (f+3)
-
- vsrab v13, v14, v8 ;# f1 >>= 3
- vsrab v15, v15, v8 ;# f2 >>= 3
-
- vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
- vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
-.endm
-
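-;# Per-lane scalar model of common_adjust (an illustrative sketch,
-;# not from the original source); c() is saturation to a signed
-;# byte, and pels are already XORed with 0x80:
-;#
-;#   int f  = c(c(p1 - q1) + 3 * (q0 - p0)); /* p1-q1 term is     */
-;#   f &= mask;                              /* hev-gated if used */
-;#   int f1 = c(f + 4) >> 3;                 /* arithmetic shift  */
-;#   int f2 = c(f + 3) >> 3;
-;#   q0 = c(q0 - f1);
-;#   p0 = c(p0 + f2);
-;#
-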
-.macro vp8_mbfilter
- Masks
-
-    ;# start the filtering here
- vxor v1, v1, v11 ;# SP2
- vxor v2, v2, v11 ;# SP1
- vxor v3, v3, v11 ;# SP0
- vxor v4, v4, v11 ;# SQ0
- vxor v5, v5, v11 ;# SQ1
- vxor v6, v6, v11 ;# SQ2
-
- ;# add outer taps if we have high edge variance
- vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
-
- vsubsbs v14, v4, v3 ;# SQ0-SP0
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
- vandc v13, v13, v8 ;# f &= mask
- vand v15, v13, v10 ;# f2 = f & hev
-
- ;# save bottom 3 bits so that we round one side +4 and the other +3
- vspltisb v8, 3
- vspltisb v9, 4
-
- vaddsbs v14, v15, v9 ;# f1 = c (f+4)
- vaddsbs v15, v15, v8 ;# f2 = c (f+3)
-
- vsrab v14, v14, v8 ;# f1 >>= 3
- vsrab v15, v15, v8 ;# f2 >>= 3
-
- vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
- vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
-
- ;# only apply wider filter if not high edge variance
- vandc v13, v13, v10 ;# f &= ~hev
-
- vspltisb v9, 2
- vnor v8, v8, v8
- vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
- vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
- vspltisb v8, 9
-
- ;# roughly 1/7th difference across boundary
- vspltish v10, 7
-    vmulosb v14, v8, v13        ;# 9 * f, odd bytes
-    vmulesb v15, v8, v13        ;# 9 * f, even bytes
- vaddshs v14, v14, v9 ;# += 63
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v6, v6, v10 ;# subtract from Q and add to P
- vaddsbs v1, v1, v10
-
- vxor v6, v6, v11
- vxor v1, v1, v11
-
- ;# roughly 2/7th difference across boundary
- vspltish v10, 7
- vaddubm v12, v8, v8
-    vmulosb v14, v12, v13       ;# 18 * f, odd bytes
-    vmulesb v15, v12, v13       ;# 18 * f, even bytes
- vaddshs v14, v14, v9
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v5, v5, v10 ;# subtract from Q and add to P
- vaddsbs v2, v2, v10
-
- vxor v5, v5, v11
- vxor v2, v2, v11
-
- ;# roughly 3/7th difference across boundary
- vspltish v10, 7
- vaddubm v12, v12, v8
-    vmulosb v14, v12, v13       ;# 27 * f, odd bytes
-    vmulesb v15, v12, v13       ;# 27 * f, even bytes
- vaddshs v14, v14, v9
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v4, v4, v10 ;# subtract from Q and add to P
- vaddsbs v3, v3, v10
-
- vxor v4, v4, v11
- vxor v3, v3, v11
-.endm
-
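-;# The three "1/7th, 2/7th, 3/7th" passes above amount to, per lane
-;# (an illustrative scalar sketch, not from the original source;
-;# c() saturates to a signed byte, w is the non-hev filter value):
-;#
-;#   int a9  = (9  * w + 63) >> 7;  q2 = c(q2 - a9);  p2 = c(p2 + a9);
-;#   int a18 = (18 * w + 63) >> 7;  q1 = c(q1 - a18); p1 = c(p1 + a18);
-;#   int a27 = (27 * w + 63) >> 7;  q0 = c(q0 - a27); p0 = c(p0 + a27);
-;#
-;# the vmulosb/vmulesb pairs perform the 9/18/27 multiplies on the
-;# odd and even bytes in parallel.
-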
-.macro SBFilter
- Masks
-
- common_adjust v3, v4, v2, v5, 1
-
- ;# outer tap adjustments
- vspltisb v8, 1
-
- vaddubm v13, v13, v8 ;# f += 1
- vsrab v13, v13, v8 ;# f >>= 1
-
- vandc v13, v13, v10 ;# f &= ~hev
-
- vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
- vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
-
- vxor v2, v2, v11
- vxor v3, v3, v11
- vxor v4, v4, v11
- vxor v5, v5, v11
-.endm
-
- .align 2
-mbloop_filter_horizontal_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r5, r6, r7, v8, v9, v10
-
- load_data_y
-
- vp8_mbfilter
-
- stvx v1, r7, r6 ;# P2
- stvx v2, 0, r6 ;# P1
- stvx v3, r7, r3 ;# P0
- stvx v4, 0, r3 ;# Q0
- stvx v5, r4, r3 ;# Q1
- stvx v6, r5, r3 ;# Q2
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-loop_filter_horizontal_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r5, r6, r7, v8, v9, v10
-
- load_data_y
-
- SBFilter
-
- stvx v2, 0, r6 ;# P1
- stvx v3, r7, r3 ;# P0
- stvx v4, 0, r3 ;# Q0
- stvx v5, r4, r3 ;# Q1
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary.
-;# So we can read in an entire mb aligned. However if we want to filter the mb
-;# edge we run into problems. For the loopfilter we require 4 bytes before the mb
-;# and 4 after, for a total of 8 bytes. Reading 16 bytes in order to get 4 is a bit
-;# of a waste. So this is an even uglier way to get around that.
-;# Using the regular register file words are read in and then saved back out to
-;# memory to align and order them up. Then they are read in using the
-;# vector register file.
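-;#
-;# Scalar sketch of RLVmb (illustrative, not from the original
-;# source): two rows of the 8 pels straddling the edge are packed
-;# into one aligned 16-byte temp so a single lvx fetches both rows:
-;#
-;#   for (int r = 0; r < 2; ++r) {
-;#       s += stride;
-;#       memcpy(tmp + 8 * r,     s - 4, 4);  /* 4 pels before edge */
-;#       memcpy(tmp + 8 * r + 4, s,     4);  /* 4 pels after edge  */
-;#   }
-;#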
-.macro RLVmb V, R
- lwzux r0, r3, r4
- stw r0, 4(\R)
- lwz r0,-4(r3)
- stw r0, 0(\R)
- lwzux r0, r3, r4
- stw r0,12(\R)
- lwz r0,-4(r3)
- stw r0, 8(\R)
- lvx \V, 0, \R
-.endm
-
-.macro WLVmb V, R
- stvx \V, 0, \R
- lwz r0,12(\R)
- stwux r0, r3, r4
- lwz r0, 8(\R)
- stw r0,-4(r3)
- lwz r0, 4(\R)
- stwux r0, r3, r4
- lwz r0, 0(\R)
- stw r0,-4(r3)
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-mbloop_filter_vertical_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
- sub r3, r3, r4
-
- RLVmb v0, r9
- RLVmb v1, r9
- RLVmb v2, r9
- RLVmb v3, r9
- RLVmb v4, r9
- RLVmb v5, r9
- RLVmb v6, r9
- RLVmb v7, r9
-
- transpose8x16_fwd
-
- build_constants r5, r6, r7, v8, v9, v10
-
- vp8_mbfilter
-
- transpose8x16_inv
-
- add r3, r3, r4
- neg r4, r4
-
- WLVmb v17, r9
- WLVmb v16, r9
- WLVmb v15, r9
- WLVmb v14, r9
- WLVmb v13, r9
- WLVmb v12, r9
- WLVmb v11, r9
- WLVmb v10, r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro RL V, R, P
- lvx \V, 0, \R
- add \R, \R, \P
-.endm
-
-.macro WL V, R, P
- stvx \V, 0, \R
- add \R, \R, \P
-.endm
-
-.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
- ;# K = |P0-P1| already
- Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
- vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
- vcmpgtub v10, v14, v0
-
-    Abs v4, v5, \Q2, \Q3        ;# K = |Q2-Q3| = next |P0-P1|
-
- max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
- max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
- max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
-
- vmaxub v14, v14, v4 ;# M = max interior abs diff
- vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
-
- Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
- vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
- vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
-
- ;# replace P1,Q1 w/signed versions
- common_adjust \P0, \Q0, \P1, \Q1, 1
-
- vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
- vsrab v13, v13, v1
- vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
- vsubsbs \Q1, \Q1, v13
- vaddsbs \P1, \P1, v13
-
- vxor \P1, \P1, v11 ;# P1
- vxor \P0, \P0, v11 ;# P0
- vxor \Q0, \Q0, v11 ;# Q0
- vxor \Q1, \Q1, v11 ;# Q1
-.endm
-
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-loop_filter_vertical_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- addi r9, r3, 0
- RL v16, r9, r4
- RL v17, r9, r4
- RL v18, r9, r4
- RL v19, r9, r4
- RL v20, r9, r4
- RL v21, r9, r4
- RL v22, r9, r4
- RL v23, r9, r4
- RL v24, r9, r4
- RL v25, r9, r4
- RL v26, r9, r4
- RL v27, r9, r4
- RL v28, r9, r4
- RL v29, r9, r4
- RL v30, r9, r4
- lvx v31, 0, r9
-
- Transpose16x16
-
- vspltisb v1, 1
-
- build_constants r5, r6, r7, v3, v2, v0
-
-    Abs v4, v5, v19, v18        ;# K(v4) = first |P0-P1|
-
- Fil v16, v17, v18, v19, v20, v21, v22, v23
- Fil v20, v21, v22, v23, v24, v25, v26, v27
- Fil v24, v25, v26, v27, v28, v29, v30, v31
-
- Transpose16x16
-
- addi r9, r3, 0
- WL v16, r9, r4
- WL v17, r9, r4
- WL v18, r9, r4
- WL v19, r9, r4
- WL v20, r9, r4
- WL v21, r9, r4
- WL v22, r9, r4
- WL v23, r9, r4
- WL v24, r9, r4
- WL v25, r9, r4
- WL v26, r9, r4
- WL v27, r9, r4
- WL v28, r9, r4
- WL v29, r9, r4
- WL v30, r9, r4
- stvx v31, 0, r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-.macro active_chroma_sel V
- andi. r7, r3, 8 ;# row origin modulo 16
- add r7, r7, r7 ;# selects selectors
- lis r12, _chromaSelectors@ha
- la r0, _chromaSelectors@l(r12)
- lwzux r0, r7, r0 ;# leave selector addr in r7
-
- lvx \V, 0, r0 ;# mask to concatenate active U,V pels
-.endm
-
-.macro hread_uv Dest, U, V, Offs, VMask
- lvx \U, \Offs, r3
- lvx \V, \Offs, r4
- vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
-.endm
-
-.macro hwrite_uv New, U, V, Offs, Umask, Vmask
- vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
- vperm \V, \New, \V, \Vmask
- stvx \U, \Offs, r3 ;# Write to frame buffer
- stvx \V, \Offs, r4
-.endm
-
-;# Process U,V in parallel.
-.macro load_chroma_h
- neg r9, r5 ;# r9 = -1 * stride
- add r8, r9, r9 ;# r8 = -2 * stride
- add r10, r5, r5 ;# r10 = 2 * stride
-
- active_chroma_sel v12
-
- ;# P3, Q3 are read-only; need not save addresses or sibling pels
- add r6, r8, r8 ;# r6 = -4 * stride
- hread_uv v0, v14, v15, r6, v12
- add r6, r10, r5 ;# r6 = 3 * stride
- hread_uv v7, v14, v15, r6, v12
-
- ;# Others are read/write; save addresses and sibling pels
-
- add r6, r8, r9 ;# r6 = -3 * stride
- hread_uv v1, v16, v17, r6, v12
- hread_uv v2, v18, v19, r8, v12
- hread_uv v3, v20, v21, r9, v12
- hread_uv v4, v22, v23, 0, v12
- hread_uv v5, v24, v25, r5, v12
- hread_uv v6, v26, v27, r10, v12
-.endm
-
-.macro uresult_sel V
- load_g \V, 4(r7)
-.endm
-
-.macro vresult_sel V
- load_g \V, 8(r7)
-.endm
-
-;# always write P1,P0,Q0,Q1
-.macro store_chroma_h
- uresult_sel v11
- vresult_sel v12
- hwrite_uv v2, v18, v19, r8, v11, v12
- hwrite_uv v3, v20, v21, r9, v11, v12
- hwrite_uv v4, v22, v23, 0, v11, v12
- hwrite_uv v5, v24, v25, r5, v11, v12
-.endm
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-mbloop_filter_horizontal_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r6, r7, r8, v8, v9, v10
-
- load_chroma_h
-
- vp8_mbfilter
-
- store_chroma_h
-
- hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
- hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-loop_filter_horizontal_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r6, r7, r8, v8, v9, v10
-
- load_chroma_h
-
- SBFilter
-
- store_chroma_h
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro R V, R
- lwzux r0, r3, r5
- stw r0, 4(\R)
- lwz r0,-4(r3)
- stw r0, 0(\R)
- lwzux r0, r4, r5
- stw r0,12(\R)
- lwz r0,-4(r4)
- stw r0, 8(\R)
- lvx \V, 0, \R
-.endm
-
-
-.macro W V, R
- stvx \V, 0, \R
- lwz r0,12(\R)
- stwux r0, r4, r5
- lwz r0, 8(\R)
- stw r0,-4(r4)
- lwz r0, 4(\R)
- stwux r0, r3, r5
- lwz r0, 0(\R)
- stw r0,-4(r3)
-.endm
-
-.macro chroma_vread R
- sub r3, r3, r5 ;# back up one line for simplicity
- sub r4, r4, r5
-
- R v0, \R
- R v1, \R
- R v2, \R
- R v3, \R
- R v4, \R
- R v5, \R
- R v6, \R
- R v7, \R
-
- transpose8x16_fwd
-.endm
-
-.macro chroma_vwrite R
-
- transpose8x16_inv
-
- add r3, r3, r5
- add r4, r4, r5
- neg r5, r5 ;# Write rows back in reverse order
-
- W v17, \R
- W v16, \R
- W v15, \R
- W v14, \R
- W v13, \R
- W v12, \R
- W v11, \R
- W v10, \R
-.endm
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-mbloop_filter_vertical_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
-
- chroma_vread r9
-
- build_constants r6, r7, r8, v8, v9, v10
-
- vp8_mbfilter
-
- chroma_vwrite r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-loop_filter_vertical_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
-
- chroma_vread r9
-
- build_constants r6, r7, r8, v8, v9, v10
-
- SBFilter
-
- chroma_vwrite r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
-
-.macro vp8_simple_filter
- Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
-    vcmpgtub v8, v14, v8        ;# v8 = true if _over_ limit
-
- ;# preserve unsigned v0 and v3
- common_adjust v1, v2, v0, v3, 0
-
- vxor v1, v1, v11
- vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
-.endm
-
-.macro simple_vertical
- addi r8, 0, 16
- addi r7, r5, 32
-
- lvx v0, 0, r5
- lvx v1, r8, r5
- lvx v2, 0, r7
- lvx v3, r8, r7
-
- lis r12, _B_hihi@ha
- la r0, _B_hihi@l(r12)
- lvx v16, 0, r0
-
- lis r12, _B_lolo@ha
- la r0, _B_lolo@l(r12)
- lvx v17, 0, r0
-
- Transpose4times4x4 v16, v17
- vp8_simple_filter
-
- vxor v0, v0, v11
- vxor v3, v3, v11 ;# cvt Q0, P0 back to pels
-
- Transpose4times4x4 v16, v17
-
- stvx v0, 0, r5
- stvx v1, r8, r5
- stvx v2, 0, r7
- stvx v3, r8, r7
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-loop_filter_simple_horizontal_edge_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- ;# build constants
- lvx v8, 0, r5 ;# flimit
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-
- neg r5, r4 ;# r5 = -1 * stride
- add r6, r5, r5 ;# r6 = -2 * stride
-
- lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
- lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
- lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
- lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
-
- vp8_simple_filter
-
- stvx v1, r5, r3 ;# store P0
- stvx v2, 0, r3 ;# store Q0
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro RLV Offs
- stw r0, (\Offs*4)(r5)
- lwzux r0, r7, r4
-.endm
-
-.macro WLV Offs
- lwz r0, (\Offs*4)(r5)
- stwux r0, r7, r4
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-loop_filter_simple_vertical_edge_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- ;# build constants
- lvx v8, 0, r5 ;# flimit
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-
- la r5, -96(r1) ;# temporary space for reading in vectors
-
- ;# Store 4 pels at word "Offs" in temp array, then advance r7
- ;# to next row and read another 4 pels from the frame buffer.
-
- subi r7, r3, 2 ;# r7 -> 2 pels before start
- lwzx r0, 0, r7 ;# read first 4 pels
-
- ;# 16 unaligned word accesses
- RLV 0
- RLV 4
- RLV 8
- RLV 12
- RLV 1
- RLV 5
- RLV 9
- RLV 13
- RLV 2
- RLV 6
- RLV 10
- RLV 14
- RLV 3
- RLV 7
- RLV 11
-
- stw r0, (15*4)(r5) ;# write last 4 pels
-
- simple_vertical
-
- ;# Read temp array, write frame buffer.
- subi r7, r3, 2 ;# r7 -> 2 pels before start
- lwzx r0, 0, r5 ;# read/write first 4 pels
- stwx r0, 0, r7
-
- WLV 4
- WLV 8
- WLV 12
- WLV 1
- WLV 5
- WLV 9
- WLV 13
- WLV 2
- WLV 6
- WLV 10
- WLV 14
- WLV 3
- WLV 7
- WLV 11
- WLV 15
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
-_chromaSelectors:
- .long _B_hihi
- .long _B_Ures0
- .long _B_Vres0
- .long 0
- .long _B_lolo
- .long _B_Ures8
- .long _B_Vres8
- .long 0
-
- .align 4
-_B_Vres8:
- .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
-
- .align 4
-_B_Ures8:
- .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
-
- .align 4
-_B_lolo:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
- .align 4
-_B_Vres0:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
- .align 4
-_B_Ures0:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-
- .align 4
-_B_hihi:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/platform_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/platform_altivec.asm
deleted file mode 100644
index f81d86f740e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/platform_altivec.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl save_platform_context
- .globl restore_platform_context
-
-.macro W V P
- stvx \V, 0, \P
- addi \P, \P, 16
-.endm
-
-.macro R V P
- lvx \V, 0, \P
- addi \P, \P, 16
-.endm
-
-;# r3 context_ptr
- .align 2
-save_platform_context:
- W v20, r3
- W v21, r3
- W v22, r3
- W v23, r3
- W v24, r3
- W v25, r3
- W v26, r3
- W v27, r3
- W v28, r3
- W v29, r3
- W v30, r3
- W v31, r3
-
- blr
-
-;# r3 context_ptr
- .align 2
-restore_platform_context:
- R v20, r3
- R v21, r3
- R v22, r3
- R v23, r3
- R v24, r3
- R v25, r3
- R v26, r3
- R v27, r3
- R v28, r3
- R v29, r3
- R v30, r3
- R v31, r3
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/recon_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/recon_altivec.asm
deleted file mode 100644
index dd39e05a836..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/recon_altivec.asm
+++ /dev/null
@@ -1,175 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl recon4b_ppc
- .globl recon2b_ppc
- .globl recon_b_ppc
-
-.macro row_of16 Diff Pred Dst Stride
- lvx v1, 0, \Pred ;# v1 = pred = p0..p15
- addi \Pred, \Pred, 16 ;# next pred
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- lvx v3, 0, \Diff ;# v3 = d0..d7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
- lvx v3, r8, \Diff ;# v3 = d8..d15
- addi \Diff, \Diff, 32 ;# next diff
- vaddshs v3, v3, v1 ;# v3 = r8..r15
- vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
- stvx v2, 0, \Dst ;# to dst
- add \Dst, \Dst, \Stride ;# next dst
-.endm
-
- .text
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-recon4b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
- li r8, 16
-
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
-
-.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
- lvx v1, 0, \Pred ;# v1 = pred = p0..p15
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- lvx v3, 0, \Diff ;# v3 = d0..d7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
-    lvx v3, r8, \Diff           ;# v3 = d8..d15
- vaddshs v3, v3, v1 ;# v3 = r8..r15
-    vpkshus v2, v2, v3          ;# v2 = 8-bit r0..r15
- stvx v2, 0, r10 ;# 2 rows to dst from buf
- lwz r0, 0(r10)
-.if \write_first_four_pels
- stw r0, 0(\Dst)
-.else
- stwux r0, \Dst, \Stride
-.endif
- lwz r0, 4(r10)
- stw r0, 4(\Dst)
- lwz r0, 8(r10)
- stwux r0, \Dst, \Stride ;# advance dst to next row
- lwz r0, 12(r10)
- stw r0, 4(\Dst)
-.endm
-
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-
-recon2b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
- li r8, 16
-
- la r10, -48(r1) ;# buf
-
- two_rows_of8 r3, r4, r5, r6, 1
-
- addi r4, r4, 16; ;# next pred
- addi r3, r3, 32; ;# next diff
-
- two_rows_of8 r3, r4, r5, r6, 0
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
-
-.macro get_two_diff_rows
- stw r0, 0(r10)
- lwz r0, 4(r3)
- stw r0, 4(r10)
- lwzu r0, 32(r3)
- stw r0, 8(r10)
- lwz r0, 4(r3)
- stw r0, 12(r10)
- lvx v3, 0, r10
-.endm
-
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-recon_b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
-
- la r10, -48(r1) ;# buf
-
- lwz r0, 0(r4)
- stw r0, 0(r10)
- lwz r0, 16(r4)
- stw r0, 4(r10)
- lwz r0, 32(r4)
- stw r0, 8(r10)
- lwz r0, 48(r4)
- stw r0, 12(r10)
-
- lvx v1, 0, r10; ;# v1 = pred = p0..p15
-
- lwz r0, 0(r3) ;# v3 = d0..d7
-
- get_two_diff_rows
-
- vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
- vaddshs v2, v2, v3; ;# v2 = r0..r7
-
- lwzu r0, 32(r3) ;# v3 = d8..d15
-
- get_two_diff_rows
-
- vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
- vaddshs v3, v3, v1; ;# v3 = r8..r15
-
- vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
- stvx v2, 0, r10; ;# 16 pels to dst from buf
-
- lwz r0, 0(r10)
- stw r0, 0(r5)
- lwz r0, 4(r10)
- stwux r0, r5, r6
- lwz r0, 8(r10)
- stwux r0, r5, r6
- lwz r0, 12(r10)
- stwx r0, r5, r6
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/sad_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/sad_altivec.asm
deleted file mode 100644
index e5f26380f96..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/sad_altivec.asm
+++ /dev/null
@@ -1,277 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_sad16x16_ppc
- .globl vp8_sad16x8_ppc
- .globl vp8_sad8x16_ppc
- .globl vp8_sad8x8_ppc
- .globl vp8_sad4x4_ppc
-
-.macro load_aligned_16 V R O
- lvsl v3, 0, \R ;# permutate value for alignment
-
- lvx v1, 0, \R
- lvx v2, \O, \R
-
- vperm \V, v1, v2, v3
-.endm
-
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- li r10, 16 ;# load offset and loop counter
-
- vspltisw v8, 0 ;# zero out total to start
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-.macro SAD_16
- ;# v6 = abs (v4 - v5)
- vsububs v6, v4, v5
- vsububs v7, v5, v4
- vor v6, v6, v7
-
- ;# v8 += abs (v4 - v5)
- vsum4ubs v8, v6, v8
-.endm
-
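-;# Scalar reference for what the vector loops accumulate (a sketch,
-;# not from the original source):
-;#
-;#   unsigned sad = 0;
-;#   for (int i = 0; i < h; ++i)
-;#       for (int j = 0; j < w; ++j)
-;#           sad += abs(src[i * sp + j] - ref[i * rp + j]);
-;#
-;# vsum4ubs folds four byte differences into each word of v8, and
-;# the final vsumsws reduces v8 to the single total returned in r3.
-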
-.macro sad_16_loop loop_label
- lvsl v3, 0, r5 ;# only needs to be done once per block
-
- ;# preload a line of data before getting into the loop
- lvx v4, 0, r3
- lvx v1, 0, r5
- lvx v2, r10, r5
-
- add r5, r5, r6
- add r3, r3, r4
-
- vperm v5, v1, v2, v3
-
- .align 4
-\loop_label:
- ;# compute difference on first row
- vsububs v6, v4, v5
- vsububs v7, v5, v4
-
- ;# load up next set of data
- lvx v9, 0, r3
- lvx v1, 0, r5
- lvx v2, r10, r5
-
- ;# perform abs() of difference
- vor v6, v6, v7
- add r3, r3, r4
-
- ;# add to the running tally
- vsum4ubs v8, v6, v8
-
- ;# now onto the next line
- vperm v5, v1, v2, v3
- add r5, r5, r6
- lvx v4, 0, r3
-
- ;# compute difference on second row
- vsububs v6, v9, v5
- lvx v1, 0, r5
- vsububs v7, v5, v9
- lvx v2, r10, r5
- vor v6, v6, v7
- add r3, r3, r4
- vsum4ubs v8, v6, v8
- vperm v5, v1, v2, v3
- add r5, r5, r6
-
- bdnz \loop_label
-
- vspltisw v7, 0
-
- vsumsws v8, v8, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-.endm
-
-.macro sad_8_loop loop_label
- .align 4
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v6, r3, r10
- load_aligned_16 v7, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- vmrghb v4, v4, v6
- vmrghb v5, v5, v7
-
- SAD_16
-
- bdnz \loop_label
-
- vspltisw v7, 0
-
- vsumsws v8, v8, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad16x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- sad_16_loop sad16x16_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad16x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- sad_16_loop sad16x8_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad8x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- sad_8_loop sad8x16_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad8x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- sad_8_loop sad8x8_loop
-
- epilogue
-
- blr
-
-.macro transfer_4x4 I P
- lwz r0, 0(\I)
- add \I, \I, \P
-
- lwz r7, 0(\I)
- add \I, \I, \P
-
- lwz r8, 0(\I)
- add \I, \I, \P
-
- lwz r9, 0(\I)
-
- stw r0, 0(r1)
- stw r7, 4(r1)
- stw r8, 8(r1)
- stw r9, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad4x4_ppc:
-
- prologue
-
- transfer_4x4 r3, r4
- lvx v4, 0, r1
-
- transfer_4x4 r5, r6
- lvx v5, 0, r1
-
- vspltisw v8, 0 ;# zero out total to start
-
- ;# v6 = abs (v4 - v5)
- vsububs v6, v4, v5
- vsububs v7, v5, v4
- vor v6, v6, v7
-
- ;# v8 += abs (v4 - v5)
- vsum4ubs v7, v6, v8
- vsumsws v7, v7, v8
-
- stvx v7, 0, r1
- lwz r3, 12(r1)
-
- epilogue
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/systemdependent.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/systemdependent.c
deleted file mode 100644
index 6899c0e71cb..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/systemdependent.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "onyxc_int.h"
-
-extern void (*vp8_post_proc_down_and_across_mb_row)(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int cols,
- unsigned char *f,
- int size
-);
-
-extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
-extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
-
-extern void vp8_post_proc_down_and_across_mb_row_c
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int cols,
- unsigned char *f,
- int size
-);
-void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
-
-extern copy_mem_block_function *vp8_copy_mem16x16;
-extern copy_mem_block_function *vp8_copy_mem8x8;
-extern copy_mem_block_function *vp8_copy_mem8x4;
-
-// PPC
-extern subpixel_predict_function sixtap_predict_ppc;
-extern subpixel_predict_function sixtap_predict8x4_ppc;
-extern subpixel_predict_function sixtap_predict8x8_ppc;
-extern subpixel_predict_function sixtap_predict16x16_ppc;
-extern subpixel_predict_function bilinear_predict4x4_ppc;
-extern subpixel_predict_function bilinear_predict8x4_ppc;
-extern subpixel_predict_function bilinear_predict8x8_ppc;
-extern subpixel_predict_function bilinear_predict16x16_ppc;
-
-extern copy_mem_block_function copy_mem16x16_ppc;
-
-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
-
-// Generic C
-extern subpixel_predict_function vp8_sixtap_predict_c;
-extern subpixel_predict_function vp8_sixtap_predict8x4_c;
-extern subpixel_predict_function vp8_sixtap_predict8x8_c;
-extern subpixel_predict_function vp8_sixtap_predict16x16_c;
-extern subpixel_predict_function vp8_bilinear_predict4x4_c;
-extern subpixel_predict_function vp8_bilinear_predict8x4_c;
-extern subpixel_predict_function vp8_bilinear_predict8x8_c;
-extern subpixel_predict_function vp8_bilinear_predict16x16_c;
-
-extern copy_mem_block_function vp8_copy_mem16x16_c;
-extern copy_mem_block_function vp8_copy_mem8x8_c;
-extern copy_mem_block_function vp8_copy_mem8x4_c;
-
-void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-
-// PPC
-extern loop_filter_block_function loop_filter_mbv_ppc;
-extern loop_filter_block_function loop_filter_bv_ppc;
-extern loop_filter_block_function loop_filter_mbh_ppc;
-extern loop_filter_block_function loop_filter_bh_ppc;
-
-extern loop_filter_block_function loop_filter_mbvs_ppc;
-extern loop_filter_block_function loop_filter_bvs_ppc;
-extern loop_filter_block_function loop_filter_mbhs_ppc;
-extern loop_filter_block_function loop_filter_bhs_ppc;
-
-// Generic C
-extern loop_filter_block_function vp8_loop_filter_mbv_c;
-extern loop_filter_block_function vp8_loop_filter_bv_c;
-extern loop_filter_block_function vp8_loop_filter_mbh_c;
-extern loop_filter_block_function vp8_loop_filter_bh_c;
-
-extern loop_filter_block_function vp8_loop_filter_mbvs_c;
-extern loop_filter_block_function vp8_loop_filter_bvs_c;
-extern loop_filter_block_function vp8_loop_filter_mbhs_c;
-extern loop_filter_block_function vp8_loop_filter_bhs_c;
-
-extern loop_filter_block_function *vp8_lf_mbvfull;
-extern loop_filter_block_function *vp8_lf_mbhfull;
-extern loop_filter_block_function *vp8_lf_bvfull;
-extern loop_filter_block_function *vp8_lf_bhfull;
-
-extern loop_filter_block_function *vp8_lf_mbvsimple;
-extern loop_filter_block_function *vp8_lf_mbhsimple;
-extern loop_filter_block_function *vp8_lf_bvsimple;
-extern loop_filter_block_function *vp8_lf_bhsimple;
-
-void vp8_clear_c(void)
-{
-}
-
-void vp8_machine_specific_config(void)
-{
- // Pure C:
- vp8_clear_system_state = vp8_clear_c;
- vp8_recon_b = vp8_recon_b_c;
- vp8_recon4b = vp8_recon4b_c;
- vp8_recon2b = vp8_recon2b_c;
-
- vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc;
- vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc;
- vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc;
- vp8_bilinear_predict = bilinear_predict4x4_ppc;
-
- vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc;
- vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc;
- vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc;
- vp8_sixtap_predict = sixtap_predict_ppc;
-
- vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c;
- vp8_short_idct4x4 = short_idct4x4llm_ppc;
- vp8_dc_only_idct = vp8_dc_only_idct_c;
-
- vp8_lf_mbvfull = loop_filter_mbv_ppc;
- vp8_lf_bvfull = loop_filter_bv_ppc;
- vp8_lf_mbhfull = loop_filter_mbh_ppc;
- vp8_lf_bhfull = loop_filter_bh_ppc;
-
- vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
- vp8_lf_bvsimple = loop_filter_bvs_ppc;
- vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
- vp8_lf_bhsimple = loop_filter_bhs_ppc;
-
- vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
- vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
- vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
- vp8_plane_add_noise = vp8_plane_add_noise_c;
-
- vp8_copy_mem16x16 = copy_mem16x16_ppc;
- vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
- vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
-
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_altivec.asm
deleted file mode 100644
index fb8d5bb1d9c..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_altivec.asm
+++ /dev/null
@@ -1,375 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_get8x8var_ppc
- .globl vp8_get16x16var_ppc
- .globl vp8_mse16x16_ppc
- .globl vp8_variance16x16_ppc
- .globl vp8_variance16x8_ppc
- .globl vp8_variance8x16_ppc
- .globl vp8_variance8x8_ppc
- .globl vp8_variance4x4_ppc
-
-.macro load_aligned_16 V R O
- lvsl v3, 0, \R ;# permutate value for alignment
-
- lvx v1, 0, \R
- lvx v2, \O, \R
-
- vperm \V, v1, v2, v3
-.endm
-
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- li r10, 16 ;# load offset and loop counter
-
- vspltisw v7, 0 ;# zero for merging
- vspltisw v8, 0 ;# zero out total to start
- vspltisw v9, 0 ;# zero out total for dif^2
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-.macro compute_sum_sse
-    ;# Compute sum first. Unpack so that signed subtract
-    ;# can be used. Only a half-word signed subtract is
-    ;# available. Do high, then low.
- vmrghb v2, v7, v4
- vmrghb v3, v7, v5
- vsubshs v2, v2, v3
- vsum4shs v8, v2, v8
-
- vmrglb v2, v7, v4
- vmrglb v3, v7, v5
- vsubshs v2, v2, v3
- vsum4shs v8, v2, v8
-
- ;# Now compute sse.
- vsububs v2, v4, v5
- vsububs v3, v5, v4
- vor v2, v2, v3
-
- vmsumubm v9, v2, v2, v9
-.endm
-
-.macro variance_16 DS loop_label store_sum
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- compute_sum_sse
-
- bdnz \loop_label
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
-.if \store_sum
- stw r3, 0(r8) ;# sum
-.endif
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, \DS ;# (sum*sum) >> DS
- subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
-.endm
-
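-;# Scalar reference for the variance macros (a sketch, not from the
-;# original source), with DS = log2(w * h):
-;#
-;#   int sum = 0;  unsigned sse = 0;
-;#   for (int i = 0; i < h; ++i)
-;#       for (int j = 0; j < w; ++j) {
-;#           int d = src[i * sp + j] - ref[i * rp + j];
-;#           sum += d;  sse += d * d;
-;#       }
-;#   return sse - ((unsigned)(sum * sum) >> DS);
-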
-.macro variance_8 DS loop_label store_sum
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v6, r3, r10
- load_aligned_16 v0, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- vmrghb v4, v4, v6
- vmrghb v5, v5, v0
-
- compute_sum_sse
-
- bdnz \loop_label
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
-.if \store_sum
- stw r3, 0(r8) ;# sum
-.endif
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
-    srlwi r3, r3, \DS           ;# (sum*sum) >> DS
-    subf r3, r3, r4             ;# sse - ((sum*sum) >> DS)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get8x8var_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- variance_8 6, get8x8var_loop, 1
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get16x16var_ppc:
-
- prologue
-
- mtctr r10
-
- variance_16 8, get16x16var_loop, 1
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_mse16x16_ppc:
- prologue
-
- mtctr r10
-
-mse16x16_loop:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# Now compute sse.
- vsububs v2, v4, v5
- vsububs v3, v5, v4
- vor v2, v2, v3
-
- vmsumubm v9, v2, v2, v9
-
- bdnz mse16x16_loop
-
- vsumsws v9, v9, v7
-
- stvx v9, 0, r1
- lwz r3, 12(r1)
-
- stw r3, 0(r7) ;# sse
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance16x16_ppc:
-
- prologue
-
- mtctr r10
-
- variance_16 8, variance16x16_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance16x8_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- variance_16 7, variance16x8_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance8x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- variance_8 7, variance8x16_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance8x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- variance_8 6, variance8x8_loop, 0
-
- epilogue
-
- blr
-
-.macro transfer_4x4 I P
- lwz r0, 0(\I)
- add \I, \I, \P
-
- lwz r10,0(\I)
- add \I, \I, \P
-
- lwz r8, 0(\I)
- add \I, \I, \P
-
- lwz r9, 0(\I)
-
- stw r0, 0(r1)
- stw r10, 4(r1)
- stw r8, 8(r1)
- stw r9, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance4x4_ppc:
-
- prologue
-
- transfer_4x4 r3, r4
- lvx v4, 0, r1
-
- transfer_4x4 r5, r6
- lvx v5, 0, r1
-
- compute_sum_sse
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, 4 ;# (sum*sum) >> 4
- subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
-
- epilogue
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm
deleted file mode 100644
index 2308373a1d2..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_sub_pixel_variance4x4_ppc
- .globl vp8_sub_pixel_variance8x8_ppc
- .globl vp8_sub_pixel_variance8x16_ppc
- .globl vp8_sub_pixel_variance16x8_ppc
- .globl vp8_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
- load_c \V0, vfilter_b, r6, r12, r10
-
- addi r6, r6, 16
- lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
- ;# load up horizontal filter
- slwi. r5, r5, 4 ;# index into horizontal filter array
-
- ;# index to the next set of vectors in the row.
- li r10, 16
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq \jump_label
-
- load_c v20, hfilter_b, r5, r12, r0
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v28, b_hperm_b, 0, r12, r0
-
- ;# index to the next set of vectors in the row.
- li r12, 32
-
- ;# rounding added in on the multiply
- vspltisw v21, 8
- vspltisw v18, 3
- vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
- slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 9 bytes wide, output is 8 bytes.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
-
-    vperm v24, v21, v21, \hp    ;# v24 = 0123 1234 2345 3456
-    vperm v25, v21, v21, \lp    ;# v25 = 4567 5678 6789 789A
-
- vmsummbm v24, v20, v24, v18
- vmsummbm v25, v20, v25, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
-
- vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
-
-.macro vfilter_16 P0 P1
- vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
- vadduhm v22, v18, v22
- vmuloub v23, \P0, v20
- vadduhm v23, v18, v23
-
- vmuleub v24, \P1, v21
- vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
- vmuloub v25, \P1, v21
- vadduhm v23, v23, v25 ;# Ro = odds
-
- vsrh v22, v22, v19 ;# divide by 128
-    vsrh v23, v23, v19          ;# v22 v23 = evens, odds
-    vmrghh \P0, v22, v23        ;# merge back to 16-bit results in order
- vmrglh v23, v22, v23
- vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
-
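-;# Per-pel scalar model of a two-tap pass such as vfilter_16 (an
-;# illustrative sketch, not from the original source), assuming the
-;# usual bilinear taps f0 + f1 == 128 with the +64 rounding term:
-;#
-;#   unsigned char tap2(unsigned char a, unsigned char b,
-;#                      int f0, int f1) {
-;#       return (unsigned char)((a * f0 + b * f1 + 64) >> 7);
-;#   }
-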
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
-    ;# Compute sum first. Unpack so that signed subtract
-    ;# can be used. Only a half-word signed subtract is
-    ;# available. Do high, then low.
- vmrghb \t1, \z0, \src
- vmrghb \t2, \z0, \ref
- vsubshs \t1, \t1, \t2
- vsum4shs \sum, \t1, \sum
-
- vmrglb \t1, \z0, \src
- vmrglb \t2, \z0, \ref
- vsubshs \t1, \t1, \t2
- vsum4shs \sum, \t1, \sum
-
- ;# Now compute sse.
- vsububs \t1, \src, \ref
- vsububs \t2, \ref, \src
- vor \t1, \t1, \t2
-
- vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
- vsumsws \sum, \sum, \z0
- vsumsws \sse, \sse, \z0
-
- stvx \sum, 0, r1
- lwz r3, 12(r1)
-
- stvx \sse, 0, r1
- lwz r4, 12(r1)
-
- stw r4, 0(r9) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
-    srlwi r3, r3, \DS           ;# (sum*sum) >> DS
-    subf r3, r3, r4             ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro compute_sum_sse_16 V, increment_counter
- load_and_align_16 v16, r7, r8, \increment_counter
- compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
-
-.macro load_and_align_16 V, R, P, increment_counter
- lvsl v17, 0, \R ;# permute value for alignment
-
- ;# output is 16 bytes; the unaligned input
- ;# can span two vectors.
- lvx v21, 0, \R
- lvx v22, r10, \R
-
-.if \increment_counter
- add \R, \R, \P
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance4x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_4x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r12, r0
- load_c v11, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v10, v11, 1
- hfilter_8 v1, v10, v11, 1
- hfilter_8 v2, v10, v11, 1
- hfilter_8 v3, v10, v11, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump straight to computing the sum and
- ;# sse. Otherwise load up and filter the additional line that
- ;# is needed for the vertical filter.
- beq compute_sum_sse_4x4_b
-
- hfilter_8 v4, v10, v11, 0
-
- b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-compute_sum_sse_4x4_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- load_and_align_16 v4, r7, r8, 1
- load_and_align_16 v5, r7, r8, 1
- load_and_align_16 v6, r7, r8, 1
- load_and_align_16 v7, r7, r8, 1
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
-
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
-
- load_c v10, b_hilo_b, 0, r12, r0
-
- vperm v0, v0, v1, v10
- vperm v1, v2, v3, v10
-
- compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 4
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff0
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x8_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r12, r0
- load_c v11, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v10, v11, 1
- hfilter_8 v1, v10, v11, 1
- hfilter_8 v2, v10, v11, 1
- hfilter_8 v3, v10, v11, 1
- hfilter_8 v4, v10, v11, 1
- hfilter_8 v5, v10, v11, 1
- hfilter_8 v6, v10, v11, 1
- hfilter_8 v7, v10, v11, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump straight to computing the sum and
- ;# sse. Otherwise load up and filter the additional line that
- ;# is needed for the vertical filter.
- beq compute_sum_sse_8x8_b
-
- hfilter_8 v8, v10, v11, 0
-
- b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 0
-
- beq compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
-
- load_and_align_16 v4, r7, r8, 1
- load_and_align_16 v5, r7, r8, 1
- load_and_align_16 v6, r7, r8, 1
- load_and_align_16 v7, r7, r8, 1
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 0
-
- vmrghb v4, v4, v5
- vmrghb v5, v6, v7
- vmrghb v6, v8, v9
- vmrghb v7, v10, v11
-
- compute_sum_sse v0, v4, v18, v19, v20, v21, v23
- compute_sum_sse v1, v5, v18, v19, v20, v21, v23
- compute_sum_sse v2, v6, v18, v19, v20, v21, v23
- compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 6
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance8x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfffc
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x16_pre_copy_b
-
- ;# Load up permutation constants
- load_c v29, b_0123_b, 0, r12, r0
- load_c v30, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v29, v30, 1
- hfilter_8 v1, v29, v30, 1
- hfilter_8 v2, v29, v30, 1
- hfilter_8 v3, v29, v30, 1
- hfilter_8 v4, v29, v30, 1
- hfilter_8 v5, v29, v30, 1
- hfilter_8 v6, v29, v30, 1
- hfilter_8 v7, v29, v30, 1
- hfilter_8 v8, v29, v30, 1
- hfilter_8 v9, v29, v30, 1
- hfilter_8 v10, v29, v30, 1
- hfilter_8 v11, v29, v30, 1
- hfilter_8 v12, v29, v30, 1
- hfilter_8 v13, v29, v30, 1
- hfilter_8 v14, v29, v30, 1
- hfilter_8 v15, v29, v30, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump straight to computing the sum and
- ;# sse. Otherwise load up and filter the additional line that
- ;# is needed for the vertical filter.
- beq compute_sum_sse_8x16_b
-
- hfilter_8 v16, v29, v30, 0
-
- b second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
- load_and_align_16 v9, r3, r4, 1
- load_and_align_16 v10, r3, r4, 1
- load_and_align_16 v11, r3, r4, 1
- load_and_align_16 v12, r3, r4, 1
- load_and_align_16 v13, r3, r4, 1
- load_and_align_16 v14, r3, r4, 1
- load_and_align_16 v15, r3, r4, 1
- load_and_align_16 v16, r3, r4, 0
-
- beq compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
- vmrghb v4, v8, v9
- vmrghb v5, v10, v11
- vmrghb v6, v12, v13
- vmrghb v7, v14, v15
-
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 1
- load_and_align_16 v12, r7, r8, 1
- load_and_align_16 v13, r7, r8, 1
- load_and_align_16 v14, r7, r8, 1
- load_and_align_16 v15, r7, r8, 1
-
- vmrghb v8, v8, v9
- vmrghb v9, v10, v11
- vmrghb v10, v12, v13
- vmrghb v11, v14, v15
-
- compute_sum_sse v0, v8, v18, v19, v20, v21, v23
- compute_sum_sse v1, v9, v18, v19, v20, v21, v23
- compute_sum_sse v2, v10, v18, v19, v20, v21, v23
- compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 1
- load_and_align_16 v12, r7, r8, 1
- load_and_align_16 v13, r7, r8, 1
- load_and_align_16 v14, r7, r8, 1
- load_and_align_16 v15, r7, r8, 0
-
- vmrghb v8, v8, v9
- vmrghb v9, v10, v11
- vmrghb v10, v12, v13
- vmrghb v11, v14, v15
-
- compute_sum_sse v4, v8, v18, v19, v20, v21, v23
- compute_sum_sse v5, v9, v18, v19, v20, v21, v23
- compute_sum_sse v6, v10, v18, v19, v20, v21, v23
- compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 7
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
- blr
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
- ;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
- lvsl v17, 0, r3 ;# permute value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
- lvx v23, r12, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
- vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
-
- ;# set 0
- vmsummbm v24, v20, v21, v18 ;# taps times elements
-
- ;# set 1
- vsldoi v23, v21, v22, 1
- vmsummbm v25, v20, v23, v18
-
- ;# set 2
- vsldoi v23, v21, v22, 2
- vmsummbm v26, v20, v23, v18
-
- ;# set 3
- vsldoi v23, v21, v22, 3
- vmsummbm v27, v20, v23, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
- vsrh v24, v24, v19 ;# divide v24, v25 by 128
- vsrh v25, v25, v19
-
- vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
- vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance16x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- HProlog second_pass_16x8_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump straight to computing the sum and
- ;# sse. Otherwise load up and filter the additional line that
- ;# is needed for the vertical filter.
- beq compute_sum_sse_16x8_b
-
- hfilter_16 v8, 0
-
- b second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
-
- beq compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-compute_sum_sse_16x8_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- compute_sum_sse_16 v0, 1
- compute_sum_sse_16 v1, 1
- compute_sum_sse_16 v2, 1
- compute_sum_sse_16 v3, 1
- compute_sum_sse_16 v4, 1
- compute_sum_sse_16 v5, 1
- compute_sum_sse_16 v6, 1
- compute_sum_sse_16 v7, 0
-
- variance_final v18, v19, v23, 7
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- HProlog second_pass_16x16_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
- hfilter_16 v8, 1
- hfilter_16 v9, 1
- hfilter_16 v10, 1
- hfilter_16 v11, 1
- hfilter_16 v12, 1
- hfilter_16 v13, 1
- hfilter_16 v14, 1
- hfilter_16 v15, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump straight to computing the sum and
- ;# sse. Otherwise load up and filter the additional line that
- ;# is needed for the vertical filter.
- beq compute_sum_sse_16x16_b
-
- hfilter_16 v16, 0
-
- b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
- load_and_align_16 v9, r3, r4, 1
- load_and_align_16 v10, r3, r4, 1
- load_and_align_16 v11, r3, r4, 1
- load_and_align_16 v12, r3, r4, 1
- load_and_align_16 v13, r3, r4, 1
- load_and_align_16 v14, r3, r4, 1
- load_and_align_16 v15, r3, r4, 1
- load_and_align_16 v16, r3, r4, 0
-
- beq compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- compute_sum_sse_16 v0, 1
- compute_sum_sse_16 v1, 1
- compute_sum_sse_16 v2, 1
- compute_sum_sse_16 v3, 1
- compute_sum_sse_16 v4, 1
- compute_sum_sse_16 v5, 1
- compute_sum_sse_16 v6, 1
- compute_sum_sse_16 v7, 1
- compute_sum_sse_16 v8, 1
- compute_sum_sse_16 v9, 1
- compute_sum_sse_16 v10, 1
- compute_sum_sse_16 v11, 1
- compute_sum_sse_16 v12, 1
- compute_sum_sse_16 v13, 1
- compute_sum_sse_16 v14, 1
- compute_sum_sse_16 v15, 0
-
- variance_final v18, v19, v23, 8
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-hfilter_b:
- .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
- .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
- .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
- .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
- .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
- .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
- .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
- .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
-
- .align 4
-vfilter_b:
- .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
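
The two tables above encode the eight VP8 bilinear filter pairs: for sub-pel offset o in 0..7 the taps are (128 - 16*o, 16*o), replicated across the vector. A small illustrative program (an assumption for clarity, not code from this file) that regenerates the taps and works one sample:

#include <stdio.h>

int main(void)
{
    int o;
    for (o = 0; o < 8; o++)  /* reproduces one row pair of the tables */
        printf("offset %d: taps %3d %3d\n", o, 128 - 16 * o, 16 * o);
    /* e.g. offset 3 -> taps (80, 48); filtering pixels 100 and 200:
     * (80*100 + 48*200 + 64) >> 7 == 17664 >> 7 == 138 */
    return 0;
}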
-
- .align 4
-b_hperm_b:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-b_0123_b:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-b_4567_b:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
-b_hilo_b:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.c
index bac3c9474ee..e3025955871 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.c
@@ -10,6 +10,8 @@
#include <limits.h>
+#include <string.h>
+
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx/vpx_integer.h"
@@ -30,31 +32,8 @@ void vp8_copy_mem16x16_c(
for (r = 0; r < 16; r++)
{
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
- dst[8] = src[8];
- dst[9] = src[9];
- dst[10] = src[10];
- dst[11] = src[11];
- dst[12] = src[12];
- dst[13] = src[13];
- dst[14] = src[14];
- dst[15] = src[15];
-
-#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
- ((uint32_t *)dst)[2] = ((uint32_t *)src)[2] ;
- ((uint32_t *)dst)[3] = ((uint32_t *)src)[3] ;
+ memcpy(dst, src, 16);
-#endif
src += src_stride;
dst += dst_stride;
@@ -72,19 +51,8 @@ void vp8_copy_mem8x8_c(
for (r = 0; r < 8; r++)
{
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
-#endif
+ memcpy(dst, src, 8);
+
src += src_stride;
dst += dst_stride;
@@ -102,19 +70,8 @@ void vp8_copy_mem8x4_c(
for (r = 0; r < 4; r++)
{
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
-#endif
+ memcpy(dst, src, 8);
+
src += src_stride;
dst += dst_stride;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra.c
index ec51ffe40d9..0a6c51b3531 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra.c
@@ -70,10 +70,10 @@ void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
expected_dc = 128;
}
- /*vpx_memset(ypred_ptr, expected_dc, 256);*/
+ /*memset(ypred_ptr, expected_dc, 256);*/
for (r = 0; r < 16; r++)
{
- vpx_memset(ypred_ptr, expected_dc, 16);
+ memset(ypred_ptr, expected_dc, 16);
ypred_ptr += y_stride;
}
}
@@ -98,7 +98,7 @@ void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
for (r = 0; r < 16; r++)
{
- vpx_memset(ypred_ptr, yleft_col[r], 16);
+ memset(ypred_ptr, yleft_col[r], 16);
ypred_ptr += y_stride;
}
@@ -202,12 +202,12 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
}
- /*vpx_memset(upred_ptr,expected_udc,64);*/
- /*vpx_memset(vpred_ptr,expected_vdc,64);*/
+ /*memset(upred_ptr,expected_udc,64);*/
+ /*memset(vpred_ptr,expected_vdc,64);*/
for (i = 0; i < 8; i++)
{
- vpx_memset(upred_ptr, expected_udc, 8);
- vpx_memset(vpred_ptr, expected_vdc, 8);
+ memset(upred_ptr, expected_udc, 8);
+ memset(vpred_ptr, expected_vdc, 8);
upred_ptr += pred_stride;
vpred_ptr += pred_stride;
}
@@ -217,8 +217,8 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
{
for (i = 0; i < 8; i++)
{
- vpx_memcpy(upred_ptr, uabove_row, 8);
- vpx_memcpy(vpred_ptr, vabove_row, 8);
+ memcpy(upred_ptr, uabove_row, 8);
+ memcpy(vpred_ptr, vabove_row, 8);
upred_ptr += pred_stride;
vpred_ptr += pred_stride;
}
@@ -229,8 +229,8 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
{
for (i = 0; i < 8; i++)
{
- vpx_memset(upred_ptr, uleft_col[i], 8);
- vpx_memset(vpred_ptr, vleft_col[i], 8);
+ memset(upred_ptr, uleft_col[i], 8);
+ memset(vpred_ptr, vleft_col[i], 8);
upred_ptr += pred_stride;
vpred_ptr += pred_stride;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd.c
index 0b371b094aa..ab0e9b47fe8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd.c
@@ -7,15 +7,13 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#define RTCD_C
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
#include "vpx_ports/vpx_once.h"
-extern void vpx_scale_rtcd(void);
void vp8_rtcd()
{
- vpx_scale_rtcd();
once(setup_rtcd_internal);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
index c73ecf93f15..56b7db7ec33 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
@@ -304,88 +304,6 @@ $vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
#
-# Single block SAD
-#
-add_proto qw/unsigned int vp8_sad4x4/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad4x4 mmx sse2 neon/;
-$vp8_sad4x4_sse2=vp8_sad4x4_wmt;
-
-add_proto qw/unsigned int vp8_sad8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad8x8 mmx sse2 neon/;
-$vp8_sad8x8_sse2=vp8_sad8x8_wmt;
-
-add_proto qw/unsigned int vp8_sad8x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad8x16 mmx sse2 neon/;
-$vp8_sad8x16_sse2=vp8_sad8x16_wmt;
-
-add_proto qw/unsigned int vp8_sad16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad16x8 mmx sse2 neon/;
-$vp8_sad16x8_sse2=vp8_sad16x8_wmt;
-
-add_proto qw/unsigned int vp8_sad16x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad16x16 mmx sse2 sse3 media neon/;
-$vp8_sad16x16_sse2=vp8_sad16x16_wmt;
-$vp8_sad16x16_media=vp8_sad16x16_armv6;
-
-#
-# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-#
-add_proto qw/void vp8_sad4x4x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad4x4x3 sse3/;
-
-add_proto qw/void vp8_sad8x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x8x3 sse3/;
-
-add_proto qw/void vp8_sad8x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x16x3 sse3/;
-
-add_proto qw/void vp8_sad16x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x8x3 sse3 ssse3/;
-
-add_proto qw/void vp8_sad16x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x16x3 sse3 ssse3/;
-
-# Note the only difference in the following prototypes is that they return into
-# an array of short
-add_proto qw/void vp8_sad4x4x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad4x4x8 sse4_1/;
-$vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4;
-
-add_proto qw/void vp8_sad8x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad8x8x8 sse4_1/;
-$vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4;
-
-add_proto qw/void vp8_sad8x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad8x16x8 sse4_1/;
-$vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4;
-
-add_proto qw/void vp8_sad16x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad16x8x8 sse4_1/;
-$vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4;
-
-add_proto qw/void vp8_sad16x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad16x16x8 sse4_1/;
-$vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4;
-
-#
-# Multi-block SAD, comparing a reference to N independent blocks
-#
-add_proto qw/void vp8_sad4x4x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad4x4x4d sse3/;
-
-add_proto qw/void vp8_sad8x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x8x4d sse3/;
-
-add_proto qw/void vp8_sad8x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x16x4d sse3/;
-
-add_proto qw/void vp8_sad16x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x8x4d sse3/;
-
-add_proto qw/void vp8_sad16x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x16x4d sse3/;
-
-#
# Encoder functions below this point.
#
if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
@@ -454,25 +372,7 @@ add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
specialize qw/vp8_regular_quantize_b sse2 sse4_1/;
add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
-$vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6;
-$vp8_fast_quantize_b_neon_asm=vp8_fast_quantize_b_neon;
-
-add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
-# no asm yet
-
-add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
-specialize qw/vp8_fast_quantize_b_pair neon_asm/;
-$vp8_fast_quantize_b_pair_neon_asm=vp8_fast_quantize_b_pair_neon;
-
-add_proto qw/void vp8_quantize_mb/, "struct macroblock *";
-specialize qw/vp8_quantize_mb neon/;
-
-add_proto qw/void vp8_quantize_mby/, "struct macroblock *";
-specialize qw/vp8_quantize_mby neon/;
-
-add_proto qw/void vp8_quantize_mbuv/, "struct macroblock *";
-specialize qw/vp8_quantize_mbuv neon/;
+specialize qw/vp8_fast_quantize_b sse2 ssse3 neon/;
#
# Block subtraction
@@ -490,16 +390,13 @@ specialize qw/vp8_mbuverror mmx sse2/;
$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
-specialize qw/vp8_subtract_b mmx sse2 media neon/;
-$vp8_subtract_b_media=vp8_subtract_b_armv6;
+specialize qw/vp8_subtract_b mmx sse2 neon/;
add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
-specialize qw/vp8_subtract_mby mmx sse2 media neon/;
-$vp8_subtract_mby_media=vp8_subtract_mby_armv6;
+specialize qw/vp8_subtract_mby mmx sse2 neon/;
add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
-specialize qw/vp8_subtract_mbuv mmx sse2 media neon/;
-$vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6;
+specialize qw/vp8_subtract_mbuv mmx sse2 neon/;
#
# Motion search
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/sad_c.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/sad_c.c
deleted file mode 100644
index 5f36fc96e86..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/sad_c.c
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <limits.h>
-#include <stdlib.h>
-#include "vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-static unsigned int sad_mx_n_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad, int m, int n)
-{
- int r, c;
- unsigned int sad = 0;
-
- for (r = 0; r < n; r++)
- {
- for (c = 0; c < m; c++)
- {
- sad += abs(src_ptr[c] - ref_ptr[c]);
- }
-
- if (sad > max_sad)
- break;
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- return sad;
-}
-
-/* max_sad is provided as an optional optimization point. Alternative
- * implementations of these functions are not required to check it.
- */
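
A hedged usage sketch of that optimization point (pick_best_candidate is a hypothetical helper, not a function from this file): a motion search passes its current best SAD as max_sad, so sad_mx_n_c above can abandon a candidate row-by-row once it is already worse than the best match found so far.

/* Sketch only: UINT_MAX comes from <limits.h>, already included above. */
static int pick_best_candidate(const unsigned char *src, int src_stride,
                               const unsigned char *const cand[], int n,
                               int ref_stride)
{
    int i, best = 0;
    unsigned int best_sad = UINT_MAX;
    for (i = 0; i < n; i++) {
        unsigned int sad = sad_mx_n_c(src, src_stride, cand[i], ref_stride,
                                      best_sad, 16, 16);
        if (sad < best_sad) {  /* early-exited candidates return > best_sad */
            best_sad = sad;
            best = i;
        }
    }
    return best;
}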
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad)
-{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 16);
-}
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad)
-{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 8);
-}
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad)
-{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 8);
-
-}
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad)
-{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 16);
-}
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad)
-{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 4, 4);
-}
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned short *sad_array)
-{
- sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
- sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
- sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
- sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
- sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
- sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned short *sad_array)
-{
- sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
- sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
- sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
- sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
- sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
- sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned short *sad_array)
-{
- sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
- sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
- sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
- sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
- sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
- sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned short *sad_array)
-{
- sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
- sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
- sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
- sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
- sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
- sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned short *sad_array)
-{
- sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
- sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
- sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
- sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
- sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
- sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char * const ref_ptr[], int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
- sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char * const ref_ptr[], int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
- sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char * const ref_ptr[], int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
- sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char * const ref_ptr[], int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
- sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char * const ref_ptr[], int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
- sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-/* Copy 2 macroblocks to a buffer */
-void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
- unsigned char *dst_ptr, int dst_stride,
- int height)
-{
- int r;
-
- for (r = 0; r < height; r++)
- {
-#if !(CONFIG_FAST_UNALIGNED)
- dst_ptr[0] = src_ptr[0];
- dst_ptr[1] = src_ptr[1];
- dst_ptr[2] = src_ptr[2];
- dst_ptr[3] = src_ptr[3];
- dst_ptr[4] = src_ptr[4];
- dst_ptr[5] = src_ptr[5];
- dst_ptr[6] = src_ptr[6];
- dst_ptr[7] = src_ptr[7];
- dst_ptr[8] = src_ptr[8];
- dst_ptr[9] = src_ptr[9];
- dst_ptr[10] = src_ptr[10];
- dst_ptr[11] = src_ptr[11];
- dst_ptr[12] = src_ptr[12];
- dst_ptr[13] = src_ptr[13];
- dst_ptr[14] = src_ptr[14];
- dst_ptr[15] = src_ptr[15];
- dst_ptr[16] = src_ptr[16];
- dst_ptr[17] = src_ptr[17];
- dst_ptr[18] = src_ptr[18];
- dst_ptr[19] = src_ptr[19];
- dst_ptr[20] = src_ptr[20];
- dst_ptr[21] = src_ptr[21];
- dst_ptr[22] = src_ptr[22];
- dst_ptr[23] = src_ptr[23];
- dst_ptr[24] = src_ptr[24];
- dst_ptr[25] = src_ptr[25];
- dst_ptr[26] = src_ptr[26];
- dst_ptr[27] = src_ptr[27];
- dst_ptr[28] = src_ptr[28];
- dst_ptr[29] = src_ptr[29];
- dst_ptr[30] = src_ptr[30];
- dst_ptr[31] = src_ptr[31];
-#else
- ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ;
- ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ;
- ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ;
- ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ;
- ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ;
- ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ;
- ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ;
- ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ;
-#endif
- src_ptr += src_stride;
- dst_ptr += dst_stride;
-
- }
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.c
index 60afe519f56..669564db42b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.c
@@ -17,15 +17,15 @@ void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
int i;
/* set up frame new frame for intra coded blocks */
- vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
for (i = 0; i < ybf->y_height; i++)
ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129;
- vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
for (i = 0; i < ybf->uv_height; i++)
ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
- vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
for (i = 0; i < ybf->uv_height; i++)
ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
@@ -33,7 +33,7 @@ void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf)
{
- vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
- vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
- vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h
index 89a32a72268..552a28025e6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h
@@ -14,16 +14,17 @@
#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
+
#ifdef __cplusplus
extern "C" {
#endif
-typedef unsigned int(*vp8_sad_fn_t)(
- const unsigned char *src_ptr,
+typedef unsigned int(*vpx_sad_fn_t)(
+ const uint8_t *src_ptr,
int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int max_sad);
+ const uint8_t *ref_ptr,
+ int ref_stride);
typedef void (*vp8_copy32xn_fn_t)(
const unsigned char *src_ptr,
@@ -32,27 +33,17 @@ typedef void (*vp8_copy32xn_fn_t)(
int ref_stride,
int n);
-typedef void (*vp8_sad_multi_fn_t)(
+typedef void (*vpx_sad_multi_fn_t)(
const unsigned char *src_ptr,
int source_stride,
- const unsigned char *ref_ptr,
+ const unsigned char *ref_array,
int ref_stride,
unsigned int *sad_array);
-
-typedef void (*vp8_sad_multi1_fn_t)
- (
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned short *sad_array
- );
-
-typedef void (*vp8_sad_multi_d_fn_t)
+typedef void (*vpx_sad_multi_d_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
- const unsigned char * const ref_ptr[],
+ const unsigned char * const ref_array[],
int ref_stride,
unsigned int *sad_array
);
@@ -102,15 +93,15 @@ typedef unsigned int (*vp8_get16x16prederror_fn_t)
typedef struct variance_vtable
{
- vp8_sad_fn_t sdf;
+ vpx_sad_fn_t sdf;
vp8_variance_fn_t vf;
vp8_subpixvariance_fn_t svf;
vp8_variance_fn_t svf_halfpix_h;
vp8_variance_fn_t svf_halfpix_v;
vp8_variance_fn_t svf_halfpix_hv;
- vp8_sad_multi_fn_t sdx3f;
- vp8_sad_multi1_fn_t sdx8f;
- vp8_sad_multi_d_fn_t sdx4df;
+ vpx_sad_multi_fn_t sdx3f;
+ vpx_sad_multi_fn_t sdx8f;
+ vpx_sad_multi_d_fn_t sdx4df;
#if ARCH_X86 || ARCH_X86_64
vp8_copy32xn_fn_t copymem;
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse2.asm
new file mode 100644
index 00000000000..86fae269563
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse2.asm
@@ -0,0 +1,93 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_copy32xn_sse2(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse2) PRIVATE
+sym(vp8_copy32xn_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;dst_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;dst_stride
+ movsxd rcx, dword ptr arg(4) ;height
+
+.block_copy_sse2_loopx4:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ movdqu xmm2, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqu xmm4, XMMWORD PTR [rsi]
+ movdqu xmm5, XMMWORD PTR [rsi + 16]
+ movdqu xmm6, XMMWORD PTR [rsi + rax]
+ movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ movdqa XMMWORD PTR [rdi + rdx], xmm2
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
+
+ lea rdi, [rdi+rdx*2]
+
+ movdqa XMMWORD PTR [rdi], xmm4
+ movdqa XMMWORD PTR [rdi + 16], xmm5
+ movdqa XMMWORD PTR [rdi + rdx], xmm6
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
+
+ lea rdi, [rdi+rdx*2]
+
+ sub rcx, 4
+ cmp rcx, 4
+ jge .block_copy_sse2_loopx4
+
+ cmp rcx, 0
+ je .copy_is_done
+
+.block_copy_sse2_loop:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ lea rsi, [rsi+rax]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ lea rdi, [rdi+rdx]
+
+ sub rcx, 1
+ jne .block_copy_sse2_loop
+
+.copy_is_done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
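
A rough C equivalent of the structure of vp8_copy32xn_sse2 (copy32xn_c is an illustration under the assumptions that height >= 4 on entry and, as the movdqa stores above require, that dst is 16-byte aligned):

#include <string.h>

static void copy32xn_c(const unsigned char *src, int src_stride,
                       unsigned char *dst, int dst_stride, int height)
{
    while (height >= 4) {            /* main loop, unrolled by four rows */
        int r;
        for (r = 0; r < 4; r++)
            memcpy(dst + r * dst_stride, src + r * src_stride, 32);
        src += 4 * src_stride;
        dst += 4 * dst_stride;
        height -= 4;
    }
    while (height--) {               /* remainder rows, one at a time */
        memcpy(dst, src, 32);
        src += src_stride;
        dst += dst_stride;
    }
}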
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse3.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse3.asm
new file mode 100644
index 00000000000..d789a40ccf7
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse3.asm
@@ -0,0 +1,146 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define ref_ptr rdi
+ %define ref_stride rdx
+ %define end_ptr rcx
+ %define ret_var rbx
+ %define result_ptr arg(4)
+ %define max_sad arg(4)
+ %define height dword ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ mov rsi, arg(0) ; src_ptr
+ mov rdi, arg(2) ; ref_ptr
+
+ movsxd rax, dword ptr arg(1) ; src_stride
+ movsxd rdx, dword ptr arg(3) ; ref_stride
+%else
+ %if LIBVPX_YASM_WIN64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define ref_ptr r8
+ %define ref_stride r9
+ %define end_ptr r10
+ %define ret_var r11
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define max_sad [rsp+xmm_stack_space+8+4*8]
+ %define height dword ptr [rsp+xmm_stack_space+8+4*8]
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define ref_ptr rdx
+ %define ref_stride rcx
+ %define end_ptr r9
+ %define ret_var r10
+ %define result_ptr r8
+ %define max_sad r8
+ %define height r8
+ %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+ %define src_ptr
+ %define src_stride
+ %define ref_ptr
+ %define ref_stride
+ %define end_ptr
+ %define ret_var
+ %define result_ptr
+ %define max_sad
+ %define height
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %if LIBVPX_YASM_WIN64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+
+;void vp8_copy32xn_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse3) PRIVATE
+sym(vp8_copy32xn_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+.block_copy_sse3_loopx4:
+ lea end_ptr, [src_ptr+src_stride*2]
+
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
+ movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
+ movdqu xmm4, XMMWORD PTR [end_ptr]
+ movdqu xmm5, XMMWORD PTR [end_ptr + 16]
+ movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
+ movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
+
+ lea src_ptr, [src_ptr+src_stride*4]
+
+ lea end_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
+ movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+ movdqa XMMWORD PTR [end_ptr], xmm4
+ movdqa XMMWORD PTR [end_ptr + 16], xmm5
+ movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
+ movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+ lea ref_ptr, [ref_ptr+ref_stride*4]
+
+ sub height, 4
+ cmp height, 4
+ jge .block_copy_sse3_loopx4
+
+ ;Check to see if there are more rows that need to be copied.
+ cmp height, 0
+ je .copy_is_done
+
+.block_copy_sse3_loop:
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ lea src_ptr, [src_ptr+src_stride]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ lea ref_ptr, [ref_ptr+ref_stride]
+
+ sub height, 1
+ jne .block_copy_sse3_loop
+
+.copy_is_done:
+ STACK_FRAME_DESTROY_X3
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c
index a1e4ce6b329..f2532b34da2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c
@@ -36,7 +36,7 @@ void vp8_dequant_idct_add_y_block_mmx
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
@@ -45,7 +45,7 @@ void vp8_dequant_idct_add_y_block_mmx
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
dst+4, stride);
- vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
+ memset(q + 16, 0, 2 * sizeof(q[0]));
}
if (eobs[2] > 1)
@@ -54,7 +54,7 @@ void vp8_dequant_idct_add_y_block_mmx
{
vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
dst+8, stride);
- vpx_memset(q + 32, 0, 2 * sizeof(q[0]));
+ memset(q + 32, 0, 2 * sizeof(q[0]));
}
if (eobs[3] > 1)
@@ -63,7 +63,7 @@ void vp8_dequant_idct_add_y_block_mmx
{
vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
dst+12, stride);
- vpx_memset(q + 48, 0, 2 * sizeof(q[0]));
+ memset(q + 48, 0, 2 * sizeof(q[0]));
}
q += 64;
@@ -85,7 +85,7 @@ void vp8_dequant_idct_add_uv_block_mmx
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
@@ -94,7 +94,7 @@ void vp8_dequant_idct_add_uv_block_mmx
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
dstu+4, stride);
- vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
+ memset(q + 16, 0, 2 * sizeof(q[0]));
}
q += 32;
@@ -109,7 +109,7 @@ void vp8_dequant_idct_add_uv_block_mmx
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
@@ -118,7 +118,7 @@ void vp8_dequant_idct_add_uv_block_mmx
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
dstv+4, stride);
- vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
+ memset(q + 16, 0, 2 * sizeof(q[0]));
}
q += 32;
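
The pattern above repeats per 4x4 subblock and can be summarized in C (a sketch under the assumption that the prototypes match their use above; dequant_idct_dispatch itself is hypothetical): more than one coded coefficient means a full dequantize+IDCT, exactly one means only DC is present, so a cheap DC-only add is used and the stored coefficients are cleared by hand, since only the full IDCT path clears them itself.

#include <string.h>

/* Prototypes as used in this file. */
void vp8_dequant_idct_add_mmx(short *input, short *dq,
                              unsigned char *dest, int stride);
void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred,
                              int pred_stride, unsigned char *dst,
                              int dst_stride);

static void dequant_idct_dispatch(short *q, const short *dq,
                                  unsigned char *dst, int stride, char eob)
{
    if (eob > 1) {
        vp8_dequant_idct_add_mmx(q, (short *)dq, dst, stride); /* clears q */
    } else if (eob == 1) {
        vp8_dc_only_idct_add_mmx(q[0] * dq[0], dst, stride, dst, stride);
        memset(q, 0, 2 * sizeof(q[0]));  /* clear the stale DC coefficient */
    }
    /* eob == 0: nothing coded, nothing to add */
}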
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse2.asm
deleted file mode 100644
index 8d86abc0758..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse2.asm
+++ /dev/null
@@ -1,410 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp8_sad16x16_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp8_sad16x16_wmt) PRIVATE
-sym(vp8_sad16x16_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 6
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor xmm6, xmm6
-
-.x16x16sad_wmt_loop:
-
- movq xmm0, QWORD PTR [rsi]
- movq xmm2, QWORD PTR [rsi+8]
-
- movq xmm1, QWORD PTR [rdi]
- movq xmm3, QWORD PTR [rdi+8]
-
- movq xmm4, QWORD PTR [rsi+rax]
- movq xmm5, QWORD PTR [rdi+rdx]
-
-
- punpcklbw xmm0, xmm2
- punpcklbw xmm1, xmm3
-
- psadbw xmm0, xmm1
- movq xmm2, QWORD PTR [rsi+rax+8]
-
- movq xmm3, QWORD PTR [rdi+rdx+8]
- lea rsi, [rsi+rax*2]
-
- lea rdi, [rdi+rdx*2]
- punpcklbw xmm4, xmm2
-
- punpcklbw xmm5, xmm3
- psadbw xmm4, xmm5
-
- paddw xmm6, xmm0
- paddw xmm6, xmm4
-
- cmp rsi, rcx
- jne .x16x16sad_wmt_loop
-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movq rax, xmm0
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;unsigned int vp8_sad8x16_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int max_sad)
-global sym(vp8_sad8x16_wmt) PRIVATE
-sym(vp8_sad8x16_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rbx*8]
-
- lea rcx, [rcx+rbx*8]
- pxor mm7, mm7
-
-.x8x16sad_wmt_loop:
-
- movq rax, mm7
- cmp eax, arg(4)
- ja .x8x16sad_wmt_early_exit
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, QWORD PTR [rsi+rbx]
- movq mm3, QWORD PTR [rdi+rdx]
-
- psadbw mm0, mm1
- psadbw mm2, mm3
-
- lea rsi, [rsi+rbx*2]
- lea rdi, [rdi+rdx*2]
-
- paddw mm7, mm0
- paddw mm7, mm2
-
- cmp rsi, rcx
- jne .x8x16sad_wmt_loop
-
- movq rax, mm7
-
-.x8x16sad_wmt_early_exit:
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp8_sad8x8_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp8_sad8x8_wmt) PRIVATE
-sym(vp8_sad8x8_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rbx*8]
- pxor mm7, mm7
-
-.x8x8sad_wmt_loop:
-
- movq rax, mm7
- cmp eax, arg(4)
- ja .x8x8sad_wmt_early_exit
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- psadbw mm0, mm1
- lea rsi, [rsi+rbx]
-
- add rdi, rdx
- paddw mm7, mm0
-
- cmp rsi, rcx
- jne .x8x8sad_wmt_loop
-
- movq rax, mm7
-.x8x8sad_wmt_early_exit:
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;unsigned int vp8_sad4x4_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp8_sad4x4_wmt) PRIVATE
-sym(vp8_sad4x4_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- movd mm0, DWORD PTR [rsi]
- movd mm1, DWORD PTR [rdi]
-
- movd mm2, DWORD PTR [rsi+rax]
- movd mm3, DWORD PTR [rdi+rdx]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- psadbw mm0, mm1
- lea rsi, [rsi+rax*2]
-
- lea rdi, [rdi+rdx*2]
- movd mm4, DWORD PTR [rsi]
-
- movd mm5, DWORD PTR [rdi]
- movd mm6, DWORD PTR [rsi+rax]
-
- movd mm7, DWORD PTR [rdi+rdx]
- punpcklbw mm4, mm6
-
- punpcklbw mm5, mm7
- psadbw mm4, mm5
-
- paddw mm0, mm4
- movq rax, mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp8_sad16x8_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp8_sad16x8_wmt) PRIVATE
-sym(vp8_sad16x8_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rbx
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rbx*8]
- pxor mm7, mm7
-
-.x16x8sad_wmt_loop:
-
- movq rax, mm7
- cmp eax, arg(4)
- ja .x16x8sad_wmt_early_exit
-
- movq mm0, QWORD PTR [rsi]
- movq mm2, QWORD PTR [rsi+8]
-
- movq mm1, QWORD PTR [rdi]
- movq mm3, QWORD PTR [rdi+8]
-
- movq mm4, QWORD PTR [rsi+rbx]
- movq mm5, QWORD PTR [rdi+rdx]
-
- psadbw mm0, mm1
- psadbw mm2, mm3
-
- movq mm1, QWORD PTR [rsi+rbx+8]
- movq mm3, QWORD PTR [rdi+rdx+8]
-
- psadbw mm4, mm5
- psadbw mm1, mm3
-
- lea rsi, [rsi+rbx*2]
- lea rdi, [rdi+rdx*2]
-
- paddw mm0, mm2
- paddw mm4, mm1
-
- paddw mm7, mm0
- paddw mm7, mm4
-
- cmp rsi, rcx
- jne .x16x8sad_wmt_loop
-
- movq rax, mm7
-
-.x16x8sad_wmt_early_exit:
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_copy32xn_sse2(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; int height);
-global sym(vp8_copy32xn_sse2) PRIVATE
-sym(vp8_copy32xn_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;dst_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;dst_stride
- movsxd rcx, dword ptr arg(4) ;height
-
-.block_copy_sse2_loopx4:
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi + 16]
- movdqu xmm2, XMMWORD PTR [rsi + rax]
- movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
-
- lea rsi, [rsi+rax*2]
-
- movdqu xmm4, XMMWORD PTR [rsi]
- movdqu xmm5, XMMWORD PTR [rsi + 16]
- movdqu xmm6, XMMWORD PTR [rsi + rax]
- movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
-
- lea rsi, [rsi+rax*2]
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm1
- movdqa XMMWORD PTR [rdi + rdx], xmm2
- movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
-
- lea rdi, [rdi+rdx*2]
-
- movdqa XMMWORD PTR [rdi], xmm4
- movdqa XMMWORD PTR [rdi + 16], xmm5
- movdqa XMMWORD PTR [rdi + rdx], xmm6
- movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
-
- lea rdi, [rdi+rdx*2]
-
- sub rcx, 4
- cmp rcx, 4
- jge .block_copy_sse2_loopx4
-
- cmp rcx, 0
- je .copy_is_done
-
-.block_copy_sse2_loop:
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi + 16]
- lea rsi, [rsi+rax]
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm1
- lea rdi, [rdi+rdx]
-
- sub rcx, 1
- jne .block_copy_sse2_loop
-
-.copy_is_done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
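For orientation: the file deleted above implemented sum-of-absolute-differences (SAD) kernels with psadbw, plus the vp8_copy32xn_sse2 block copy; equivalent implementations presumably live elsewhere in the tree after this update. A plain C reference for the SAD semantics, a sketch rather than the library's actual code:

#include <stdlib.h>

/* Reference SAD: sum of absolute pixel differences over a w x h block.
 * vp8_sad16x16_wmt and friends computed this, vectorized with psadbw. */
static unsigned int sad_c(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c)
            sad += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
    }
    return sad;
}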
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse3.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse3.asm
deleted file mode 100644
index 69c8d376973..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse3.asm
+++ /dev/null
@@ -1,960 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define ref_ptr rdi
- %define ref_stride rdx
- %define end_ptr rcx
- %define ret_var rbx
- %define result_ptr arg(4)
- %define max_sad arg(4)
- %define height dword ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- mov rsi, arg(0) ; src_ptr
- mov rdi, arg(2) ; ref_ptr
-
- movsxd rax, dword ptr arg(1) ; src_stride
- movsxd rdx, dword ptr arg(3) ; ref_stride
-%else
- %if LIBVPX_YASM_WIN64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define ref_ptr r8
- %define ref_stride r9
- %define end_ptr r10
- %define ret_var r11
- %define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define max_sad [rsp+xmm_stack_space+8+4*8]
- %define height dword ptr [rsp+xmm_stack_space+8+4*8]
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define ref_ptr rdx
- %define ref_stride rcx
- %define end_ptr r9
- %define ret_var r10
- %define result_ptr r8
- %define max_sad r8
- %define height r8
- %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
- %define src_ptr
- %define src_stride
- %define ref_ptr
- %define ref_stride
- %define end_ptr
- %define ret_var
- %define result_ptr
- %define max_sad
- %define height
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %if LIBVPX_YASM_WIN64
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-%macro STACK_FRAME_CREATE_X4 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define r0_ptr rcx
- %define r1_ptr rdx
- %define r2_ptr rbx
- %define r3_ptr rdi
- %define ref_stride rbp
- %define result_ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- push rbp
- mov rdi, arg(2) ; ref_ptr_base
-
- LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
-
- mov rsi, arg(0) ; src_ptr
-
- movsxd rbx, dword ptr arg(1) ; src_stride
- movsxd rbp, dword ptr arg(3) ; ref_stride
-
- xchg rbx, rax
-%else
- %if LIBVPX_YASM_WIN64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define r0_ptr rsi
- %define r1_ptr r10
- %define r2_ptr r11
- %define r3_ptr r8
- %define ref_stride r9
- %define result_ptr [rsp+xmm_stack_space+16+4*8]
- push rsi
-
- LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define r0_ptr r9
- %define r1_ptr r10
- %define r2_ptr r11
- %define r3_ptr rdx
- %define ref_stride rcx
- %define result_ptr r8
-
- LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-
- %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X4 0
- %define src_ptr
- %define src_stride
- %define r0_ptr
- %define r1_ptr
- %define r2_ptr
- %define r3_ptr
- %define ref_stride
- %define result_ptr
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %if LIBVPX_YASM_WIN64
- pop rsi
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm5, XMMWORD PTR [%3]
- lddqu xmm6, XMMWORD PTR [%3+1]
- lddqu xmm7, XMMWORD PTR [%3+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm1, XMMWORD PTR [%3]
- lddqu xmm2, XMMWORD PTR [%3+1]
- lddqu xmm3, XMMWORD PTR [%3+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%4]
- lddqu xmm1, XMMWORD PTR [%3+%5]
- lddqu xmm2, XMMWORD PTR [%3+%5+1]
- lddqu xmm3, XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm5, QWORD PTR [%3]
- movq mm6, QWORD PTR [%3+1]
- movq mm7, QWORD PTR [%3+2]
-
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%3+1]
- movq mm3, QWORD PTR [%3+2]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endif
- movq mm0, QWORD PTR [%2+%4]
- movq mm1, QWORD PTR [%3+%5]
- movq mm2, QWORD PTR [%3+%5+1]
- movq mm3, QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endmacro
-
-%macro LOAD_X4_ADDRESSES 5
- mov %2, [%1+REG_SZ_BYTES*0]
- mov %3, [%1+REG_SZ_BYTES*1]
-
- mov %4, [%1+REG_SZ_BYTES*2]
- mov %5, [%1+REG_SZ_BYTES*3]
-%endmacro
-
-%macro PROCESS_16X2X4 8
-%if %1==0
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm4, XMMWORD PTR [%3]
- lddqu xmm5, XMMWORD PTR [%4]
- lddqu xmm6, XMMWORD PTR [%5]
- lddqu xmm7, XMMWORD PTR [%6]
-
- psadbw xmm4, xmm0
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm1, XMMWORD PTR [%3]
- lddqu xmm2, XMMWORD PTR [%4]
- lddqu xmm3, XMMWORD PTR [%5]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm4, xmm1
- lddqu xmm1, XMMWORD PTR [%6]
- paddw xmm5, xmm2
- paddw xmm6, xmm3
-
- psadbw xmm1, xmm0
- paddw xmm7, xmm1
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%7]
- lddqu xmm1, XMMWORD PTR [%3+%8]
- lddqu xmm2, XMMWORD PTR [%4+%8]
- lddqu xmm3, XMMWORD PTR [%5+%8]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm4, xmm1
- lddqu xmm1, XMMWORD PTR [%6+%8]
- paddw xmm5, xmm2
- paddw xmm6, xmm3
-
-%if %1==0 || %1==1
- lea %2, [%2+%7*2]
- lea %3, [%3+%8*2]
-
- lea %4, [%4+%8*2]
- lea %5, [%5+%8*2]
-
- lea %6, [%6+%8*2]
-%endif
- psadbw xmm1, xmm0
- paddw xmm7, xmm1
-
-%endmacro
-
-%macro PROCESS_8X2X4 8
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm4, QWORD PTR [%3]
- movq mm5, QWORD PTR [%4]
- movq mm6, QWORD PTR [%5]
- movq mm7, QWORD PTR [%6]
-
- psadbw mm4, mm0
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%4]
- movq mm3, QWORD PTR [%5]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm4, mm1
- movq mm1, QWORD PTR [%6]
- paddw mm5, mm2
- paddw mm6, mm3
-
- psadbw mm1, mm0
- paddw mm7, mm1
-%endif
- movq mm0, QWORD PTR [%2+%7]
- movq mm1, QWORD PTR [%3+%8]
- movq mm2, QWORD PTR [%4+%8]
- movq mm3, QWORD PTR [%5+%8]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm4, mm1
- movq mm1, QWORD PTR [%6+%8]
- paddw mm5, mm2
- paddw mm6, mm3
-
-%if %1==0 || %1==1
- lea %2, [%2+%7*2]
- lea %3, [%3+%8*2]
-
- lea %4, [%4+%8*2]
- lea %5, [%5+%8*2]
-
- lea %6, [%6+%8*2]
-%endif
- psadbw mm1, mm0
- paddw mm7, mm1
-
-%endmacro
-
-;void vp8_sad16x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad16x16x3_sse3) PRIVATE
-sym(vp8_sad16x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void vp8_sad16x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad16x8x3_sse3) PRIVATE
-sym(vp8_sad16x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void vp8_sad8x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad8x16x3_sse3) PRIVATE
-sym(vp8_sad8x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void vp8_sad8x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad8x8x3_sse3) PRIVATE
-sym(vp8_sad8x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void vp8_sad4x4x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad4x4x3_sse3) PRIVATE
-sym(vp8_sad4x4x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [ref_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [ref_ptr+1]
- movd mm5, DWORD PTR [ref_ptr+2]
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- psadbw mm4, mm0
- psadbw mm5, mm0
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [ref_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm6, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm6
-
- movd mm3, DWORD PTR [ref_ptr+1]
- movd mm7, DWORD PTR [ref_ptr+2]
-
- psadbw mm2, mm0
-
- paddw mm1, mm2
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm6
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- mov rcx, result_ptr
-
- punpckldq mm1, mm3
-
- movq [rcx], mm1
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;unsigned int vp8_sad16x16_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int max_sad)
-;%define lddqu movdqu
-global sym(vp8_sad16x16_sse3) PRIVATE
-sym(vp8_sad16x16_sse3):
-
- STACK_FRAME_CREATE_X3
-
- mov end_ptr, 4
- pxor xmm7, xmm7
-
-.vp8_sad16x16_sse3_loop:
- movdqa xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [ref_ptr]
- movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
- movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- movdqa xmm4, XMMWORD PTR [src_ptr]
- movdqu xmm5, XMMWORD PTR [ref_ptr]
- movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
-
- psadbw xmm0, xmm1
-
- movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
-
- psadbw xmm2, xmm3
- psadbw xmm4, xmm5
- psadbw xmm6, xmm1
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- paddw xmm7, xmm0
- paddw xmm7, xmm2
- paddw xmm7, xmm4
- paddw xmm7, xmm6
-
- sub end_ptr, 1
- jne .vp8_sad16x16_sse3_loop
-
- movq xmm0, xmm7
- psrldq xmm7, 8
- paddw xmm0, xmm7
- movq rax, xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void vp8_copy32xn_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; int height);
-global sym(vp8_copy32xn_sse3) PRIVATE
-sym(vp8_copy32xn_sse3):
-
- STACK_FRAME_CREATE_X3
-
-.block_copy_sse3_loopx4:
- lea end_ptr, [src_ptr+src_stride*2]
-
- movdqu xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [src_ptr + 16]
- movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
- movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
- movdqu xmm4, XMMWORD PTR [end_ptr]
- movdqu xmm5, XMMWORD PTR [end_ptr + 16]
- movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
- movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
-
- lea src_ptr, [src_ptr+src_stride*4]
-
- lea end_ptr, [ref_ptr+ref_stride*2]
-
- movdqa XMMWORD PTR [ref_ptr], xmm0
- movdqa XMMWORD PTR [ref_ptr + 16], xmm1
- movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
- movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
- movdqa XMMWORD PTR [end_ptr], xmm4
- movdqa XMMWORD PTR [end_ptr + 16], xmm5
- movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
- movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
-
- lea ref_ptr, [ref_ptr+ref_stride*4]
-
- sub height, 4
- cmp height, 4
- jge .block_copy_sse3_loopx4
-
-    ;Check to see if there are more rows that need to be copied.
- cmp height, 0
- je .copy_is_done
-
-.block_copy_sse3_loop:
- movdqu xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [src_ptr + 16]
- lea src_ptr, [src_ptr+src_stride]
-
- movdqa XMMWORD PTR [ref_ptr], xmm0
- movdqa XMMWORD PTR [ref_ptr + 16], xmm1
- lea ref_ptr, [ref_ptr+ref_stride]
-
- sub height, 1
- jne .block_copy_sse3_loop
-
-.copy_is_done:
- STACK_FRAME_DESTROY_X3
-
-;void vp8_sad16x16x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr_base,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad16x16x4d_sse3) PRIVATE
-sym(vp8_sad16x16x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- movq xmm0, xmm4
- psrldq xmm4, 8
-
- paddw xmm0, xmm4
- movd [rcx], xmm0
-;-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+8], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+12], xmm0
-
- STACK_FRAME_DESTROY_X4
-
-;void vp8_sad16x8x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr_base,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad16x8x4d_sse3) PRIVATE
-sym(vp8_sad16x8x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- movq xmm0, xmm4
- psrldq xmm4, 8
-
- paddw xmm0, xmm4
- movd [rcx], xmm0
-;-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+8], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+12], xmm0
-
- STACK_FRAME_DESTROY_X4
-
-;void vp8_sad8x16x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad8x16x4d_sse3) PRIVATE
-sym(vp8_sad8x16x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- punpckldq mm4, mm5
- punpckldq mm6, mm7
-
- movq [rcx], mm4
- movq [rcx+8], mm6
-
- STACK_FRAME_DESTROY_X4
-
-;void vp8_sad8x8x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad8x8x4d_sse3) PRIVATE
-sym(vp8_sad8x8x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- punpckldq mm4, mm5
- punpckldq mm6, mm7
-
- movq [rcx], mm4
- movq [rcx+8], mm6
-
- STACK_FRAME_DESTROY_X4
-
-;void vp8_sad4x4x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad4x4x4d_sse3) PRIVATE
-sym(vp8_sad4x4x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [r0_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [r0_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [r1_ptr]
- movd mm5, DWORD PTR [r2_ptr]
-
- movd mm6, DWORD PTR [r3_ptr]
- movd mm2, DWORD PTR [r1_ptr+ref_stride]
-
- movd mm3, DWORD PTR [r2_ptr+ref_stride]
- movd mm7, DWORD PTR [r3_ptr+ref_stride]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- punpcklbw mm6, mm7
- psadbw mm4, mm0
-
- psadbw mm5, mm0
- psadbw mm6, mm0
-
-
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea r0_ptr, [r0_ptr+ref_stride*2]
-
- lea r1_ptr, [r1_ptr+ref_stride*2]
- lea r2_ptr, [r2_ptr+ref_stride*2]
-
- lea r3_ptr, [r3_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [r0_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm7, DWORD PTR [r0_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm7
-
- movd mm3, DWORD PTR [r1_ptr]
- movd mm7, DWORD PTR [r2_ptr]
-
- psadbw mm2, mm0
-%if ABI_IS_32BIT
- mov rax, rbp
-
- pop rbp
-%define ref_stride rax
-%endif
- mov rsi, result_ptr
-
- paddw mm1, mm2
- movd [rsi], mm1
-
- movd mm2, DWORD PTR [r1_ptr+ref_stride]
- movd mm1, DWORD PTR [r2_ptr+ref_stride]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm1
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- movd mm2, DWORD PTR [r3_ptr]
- movd mm1, DWORD PTR [r3_ptr+ref_stride]
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- movd [rsi+4], mm3
- punpcklbw mm2, mm1
-
- movd [rsi+8], mm7
- psadbw mm2, mm0
-
- paddw mm2, mm6
- movd [rsi+12], mm2
-
-
- STACK_FRAME_DESTROY_X4
-
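The x3 and x4d kernels deleted above batch several SAD computations against one source block: the x3 variants score three horizontally adjacent reference positions (ref, ref+1, ref+2), while the x4d variants score four independent reference pointers. In terms of the sad_c sketch above (illustrative only):

/* Four-candidate SAD, matching the shape of vp8_sad16x16x4d_sse3:
 * one source block scored against four reference positions at once. */
static void sad16x16x4d_c(const unsigned char *src, int src_stride,
                          const unsigned char *const ref[4], int ref_stride,
                          int results[4])
{
    int i;

    for (i = 0; i < 4; ++i)
        results[i] = sad_c(src, src_stride, ref[i], ref_stride, 16, 16);
}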
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse4.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse4.asm
deleted file mode 100644
index f7fccd77c58..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse4.asm
+++ /dev/null
@@ -1,353 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm1, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm1, xmm2
- paddw xmm1, xmm3
- paddw xmm1, xmm4
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endif
- movdqa xmm0, XMMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- movq xmm2, MMWORD PTR [rdi+ rdx+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
- movq xmm0, MMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm1, xmm2
-%else
- movq xmm0, MMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endif
- movq xmm0, MMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
- movd xmm0, [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- mpsadbw xmm1, xmm0, 0x0
-%else
- movd xmm0, [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endif
- movd xmm0, [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endmacro
-
-
-;void vp8_sad16x16x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array);
-global sym(vp8_sad16x16x8_sse4) PRIVATE
-sym(vp8_sad16x16x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_sad16x8x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp8_sad16x8x8_sse4) PRIVATE
-sym(vp8_sad16x8x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_sad8x8x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp8_sad8x8x8_sse4) PRIVATE
-sym(vp8_sad8x8x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_sad8x16x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp8_sad8x16x8_sse4) PRIVATE
-sym(vp8_sad8x16x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_sad4x4x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp8_sad4x4x8_sse4) PRIVATE
-sym(vp8_sad4x4x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_4X2X8 1
- PROCESS_4X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_asm_stubs.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_asm_stubs.c
index b4092938161..fb0b57eb1c1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_asm_stubs.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_asm_stubs.c
@@ -127,7 +127,7 @@ void vp8_sixtap_predict4x4_mmx
int dst_pitch
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[16*16]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
@@ -148,7 +148,7 @@ void vp8_sixtap_predict16x16_mmx
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[24*24]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
@@ -180,7 +180,7 @@ void vp8_sixtap_predict8x8_mmx
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
@@ -206,7 +206,7 @@ void vp8_sixtap_predict8x4_mmx
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
@@ -252,7 +252,7 @@ void vp8_sixtap_predict16x16_sse2
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[24*24]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
@@ -292,7 +292,7 @@ void vp8_sixtap_predict8x8_sse2
int dst_pitch
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
if (xoffset)
@@ -330,7 +330,7 @@ void vp8_sixtap_predict8x4_sse2
int dst_pitch
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
if (xoffset)
@@ -432,7 +432,7 @@ void vp8_sixtap_predict16x16_ssse3
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
+ DECLARE_ALIGNED(16, unsigned char, FData2[24*24]);
if (xoffset)
{
@@ -480,7 +480,7 @@ void vp8_sixtap_predict8x8_ssse3
int dst_pitch
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
+ DECLARE_ALIGNED(16, unsigned char, FData2[256]);
if (xoffset)
{
@@ -528,7 +528,7 @@ void vp8_sixtap_predict8x4_ssse3
int dst_pitch
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
+ DECLARE_ALIGNED(16, unsigned char, FData2[256]);
if (xoffset)
{
@@ -576,7 +576,7 @@ void vp8_sixtap_predict4x4_ssse3
int dst_pitch
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
+ DECLARE_ALIGNED(16, unsigned char, FData2[4*9]);
if (xoffset)
{
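The stub hunks above migrate from DECLARE_ALIGNED_ARRAY, which took the element count as a separate fourth argument, to DECLARE_ALIGNED, which takes an ordinary array declarator. A sketch of the gcc/clang spelling of the macro, stated as an assumption about vpx_ports/mem.h rather than a quotation of it:

/* Assumed gcc/clang form; MSVC builds spell it with __declspec(align(n)). */
#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))

void filter_stub_example(void)
{
    /* 16-byte-aligned temporary, as in the sixtap stubs above. */
    DECLARE_ALIGNED(16, unsigned short, FData2[24 * 24]);
    (void)FData2;
}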
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c
index e7cf0d9b9c6..fb300fe8827 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c
@@ -101,6 +101,8 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
int i;
#if CONFIG_ERROR_CONCEALMENT
int corruption_detected = 0;
+#else
+ (void)mb_idx;
#endif
if (xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -140,7 +142,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
* Better to use the predictor as reconstruction.
*/
pbi->frame_corrupt_residual = 1;
- vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
vp8_conceal_corrupt_mb(xd);
@@ -149,7 +151,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* force idct to be skipped for B_PRED and use the
* prediction only for reconstruction
* */
- vpx_memset(xd->eobs, 0, 25);
+ memset(xd->eobs, 0, 25);
}
}
#endif
@@ -182,7 +184,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* clear out residual eob info */
if(xd->mode_info_context->mbmi.mb_skip_coeff)
- vpx_memset(xd->eobs, 0, 25);
+ memset(xd->eobs, 0, 25);
intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
@@ -212,7 +214,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
(b->qcoeff[0] * DQC[0],
dst, dst_stride,
dst, dst_stride);
- vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
}
}
@@ -249,14 +251,14 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
vp8_short_inv_walsh4x4(&b->dqcoeff[0],
xd->qcoeff);
- vpx_memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
}
else
{
b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
xd->qcoeff);
- vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
/* override the dc dequant constant in order to preserve the
@@ -321,7 +323,7 @@ static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)Border; i++)
{
- vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ memcpy(dest_ptr1, src_ptr1, plane_stride);
dest_ptr1 += plane_stride;
}
@@ -336,7 +338,7 @@ static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)(Border); i++)
{
- vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ memcpy(dest_ptr1, src_ptr1, plane_stride);
dest_ptr1 += plane_stride;
}
@@ -349,7 +351,7 @@ static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)(Border); i++)
{
- vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ memcpy(dest_ptr1, src_ptr1, plane_stride);
dest_ptr1 += plane_stride;
}
}
@@ -377,7 +379,7 @@ static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)Border; i++)
{
- vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ memcpy(dest_ptr2, src_ptr2, plane_stride);
dest_ptr2 += plane_stride;
}
@@ -395,7 +397,7 @@ static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)(Border); i++)
{
- vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ memcpy(dest_ptr2, src_ptr2, plane_stride);
dest_ptr2 += plane_stride;
}
@@ -409,7 +411,7 @@ static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)(Border); i++)
{
- vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ memcpy(dest_ptr2, src_ptr2, plane_stride);
dest_ptr2 += plane_stride;
}
}
@@ -444,8 +446,8 @@ static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
for (i = 0; i < plane_height; i++)
{
- vpx_memset(dest_ptr1, src_ptr1[0], Border);
- vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ memset(dest_ptr1, src_ptr1[0], Border);
+ memset(dest_ptr2, src_ptr2[0], Border);
src_ptr1 += plane_stride;
src_ptr2 += plane_stride;
dest_ptr1 += plane_stride;
@@ -468,8 +470,8 @@ static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
for (i = 0; i < plane_height; i++)
{
- vpx_memset(dest_ptr1, src_ptr1[0], Border);
- vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ memset(dest_ptr1, src_ptr1[0], Border);
+ memset(dest_ptr2, src_ptr2[0], Border);
src_ptr1 += plane_stride;
src_ptr2 += plane_stride;
dest_ptr1 += plane_stride;
@@ -488,8 +490,8 @@ static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
for (i = 0; i < plane_height; i++)
{
- vpx_memset(dest_ptr1, src_ptr1[0], Border);
- vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ memset(dest_ptr1, src_ptr1[0], Border);
+ memset(dest_ptr2, src_ptr2[0], Border);
src_ptr1 += plane_stride;
src_ptr2 += plane_stride;
dest_ptr1 += plane_stride;
@@ -566,7 +568,7 @@ static void decode_mb_rows(VP8D_COMP *pbi)
/* reset contexts */
xd->above_context = pc->above_context;
- vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
xd->left_available = 0;
@@ -916,19 +918,19 @@ static void init_frame(VP8D_COMP *pbi)
if (pc->frame_type == KEY_FRAME)
{
/* Various keyframe initializations */
- vpx_memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
vp8_init_mbmode_probs(pc);
vp8_default_coef_probs(pc);
/* reset the segment feature data to 0 with delta coding (Default state). */
- vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
/* reset the mode ref deltas for loop filter */
- vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
- vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+ memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+ memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
/* All buffers are implicitly updated on key frames. */
pc->refresh_golden_frame = 1;
@@ -1067,12 +1069,11 @@ int vp8_decode_frame(VP8D_COMP *pbi)
pc->vert_scale = clear[6] >> 6;
}
data += 7;
- clear += 7;
}
else
{
- vpx_memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
- vpx_memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+ memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+ memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
}
}
if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME))
@@ -1104,7 +1105,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
{
xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
- vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
/* For each segmentation feature (Quant and loop filter level) */
for (i = 0; i < MB_LVL_MAX; i++)
@@ -1128,7 +1129,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
if (xd->update_mb_segmentation_map)
{
/* Which macro block level features are enabled */
- vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+ memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
/* Read the probs used to decode the segment id for each macro block. */
for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
@@ -1277,7 +1278,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
#endif
if (pc->refresh_entropy_probs == 0)
{
- vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+ memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
}
pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc);
@@ -1326,7 +1327,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
}
/* clear out the coeff buffer */
- vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
vp8_decode_mode_mvs(pbi);
@@ -1340,7 +1341,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
}
#endif
- vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+ memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
pbi->frame_corrupt_residual = 0;
#if CONFIG_MULTITHREAD
@@ -1379,7 +1380,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
if (pc->refresh_entropy_probs == 0)
{
- vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+ memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
pbi->independent_partitions = prev_independent_partitions;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.c
index 35a22c7de5b..1d155e7e16d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.c
@@ -591,6 +591,8 @@ static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi,
MB_MODE_INFO *mbmi)
{
+ (void)mbmi;
+
/* Read the Macroblock segmentation map if it is being updated explicitly
* this frame (reset to 0 above by default)
* By default on a key frame reset all MBs to segment 0
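Several hunks in this commit add (void) casts like the one above. This is the standard C idiom for marking a parameter as deliberately unused, typically to silence -Wunused-parameter when the parameter is only referenced under some build configuration. A minimal sketch with illustrative names:

static int concealment_helper(int mb_idx, int fallback)
{
#if CONFIG_ERROR_CONCEALMENT
    return mb_idx;      /* parameter is used in this configuration */
#else
    (void)mb_idx;       /* unused here; the cast suppresses the warning */
    return fallback;
#endif
}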
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.c
index 452ff6cba3a..fcc7533c50f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.c
@@ -20,8 +20,8 @@ void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
- vpx_memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
- vpx_memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
/* Clear entropy contexts for Y2 blocks */
if (!x->mode_info_context->mbmi.is_4x4)
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c
index 4b304c83c78..bb6d443c475 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c
@@ -350,7 +350,7 @@ static void estimate_missing_mvs(MB_OVERLAP *overlaps,
unsigned int first_corrupt)
{
int mb_row, mb_col;
- vpx_memset(overlaps, 0, sizeof(MB_OVERLAP) * mb_rows * mb_cols);
+ memset(overlaps, 0, sizeof(MB_OVERLAP) * mb_rows * mb_cols);
/* First calculate the overlaps for all blocks */
for (mb_row = 0; mb_row < mb_rows; ++mb_row)
{
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c
index 1d763b6bfae..d7b8c76dc26 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c
@@ -58,7 +58,7 @@ static struct VP8D_COMP * create_decompressor(VP8D_CONFIG *oxcf)
if (!pbi)
return NULL;
- vpx_memset(pbi, 0, sizeof(VP8D_COMP));
+ memset(pbi, 0, sizeof(VP8D_COMP));
if (setjmp(pbi->common.error.jmp))
{
@@ -87,6 +87,7 @@ static struct VP8D_COMP * create_decompressor(VP8D_CONFIG *oxcf)
pbi->ec_enabled = oxcf->error_concealment;
pbi->overlaps = NULL;
#else
+ (void)oxcf;
pbi->ec_enabled = 0;
#endif
/* Error concealment is activated after a key frame has been
@@ -303,6 +304,8 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
{
VP8_COMMON *cm = &pbi->common;
int retcode = -1;
+ (void)size;
+ (void)source;
pbi->common.error.error_code = VPX_CODEC_OK;
@@ -407,6 +410,7 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_st
#if CONFIG_POSTPROC
ret = vp8_post_proc_frame(&pbi->common, sd, flags);
#else
+ (void)flags;
if (pbi->common.frame_to_show)
{
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c
index fe290cffec7..6801532f118 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c
@@ -60,12 +60,12 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D
mbd->segmentation_enabled = xd->segmentation_enabled;
mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
- vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+ memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
/*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
- vpx_memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
+ memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
/*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
- vpx_memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
+ memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
/*unsigned char mode_ref_lf_delta_enabled;
unsigned char mode_ref_lf_delta_update;*/
mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
@@ -73,10 +73,10 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D
mbd->current_bc = &pbi->mbc[0];
- vpx_memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
- vpx_memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
- vpx_memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
- vpx_memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+ memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+ memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+ memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+ memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
mbd->fullpixel_mask = 0xffffffff;
@@ -96,6 +96,8 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
int i;
#if CONFIG_ERROR_CONCEALMENT
int corruption_detected = 0;
+#else
+ (void)mb_idx;
#endif
if (xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -135,7 +137,7 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
* Better to use the predictor as reconstruction.
*/
pbi->frame_corrupt_residual = 1;
- vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
vp8_conceal_corrupt_mb(xd);
@@ -144,7 +146,7 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* force idct to be skipped for B_PRED and use the
* prediction only for reconstruction
* */
- vpx_memset(xd->eobs, 0, 25);
+ memset(xd->eobs, 0, 25);
}
}
#endif
@@ -177,7 +179,7 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* clear out residual eob info */
if(xd->mode_info_context->mbmi.mb_skip_coeff)
- vpx_memset(xd->eobs, 0, 25);
+ memset(xd->eobs, 0, 25);
intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
@@ -227,7 +229,7 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
{
vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0],
dst, dst_stride, dst, dst_stride);
- vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
}
}
@@ -264,14 +266,14 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
vp8_short_inv_walsh4x4(&b->dqcoeff[0],
xd->qcoeff);
- vpx_memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
}
else
{
b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
xd->qcoeff);
- vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
/* override the dc dequant constant in order to preserve the
@@ -358,7 +360,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
/* reset contexts */
xd->above_context = pc->above_context;
- vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
xd->left_available = 0;
@@ -497,9 +499,9 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
if( mb_row != pc->mb_rows-1 )
{
/* Save decoded MB last row data for next-row decoding */
- vpx_memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
- vpx_memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
- vpx_memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+ memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+ memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+ memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
}
/* save left_col for next MB decoding */
@@ -874,23 +876,23 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
if (filter_level)
{
/* Set above_row buffer to 127 for decoding first MB row */
- vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5);
- vpx_memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
- vpx_memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+ memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5);
+ memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+ memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
for (j=1; j<pc->mb_rows; j++)
{
- vpx_memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
- vpx_memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
- vpx_memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
+ memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
}
/* Set left_col to 129 initially */
for (j=0; j<pc->mb_rows; j++)
{
- vpx_memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
- vpx_memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
- vpx_memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
+ memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
+ memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
+ memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
}
/* Initialize the loop filter for this frame. */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
deleted file mode 100644
index 4abe818f188..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ /dev/null
@@ -1,310 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_start_encode|
- EXPORT |vp8_encode_bool|
- EXPORT |vp8_stop_encode|
- EXPORT |vp8_encode_value|
- IMPORT |vp8_validate_buffer_arm|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
- ; macro for validating write buffer position
- ; needs vp8_writer in r0
- ; start shall not be in r1
- MACRO
- VALIDATE_POS $start, $pos
- push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
- ldr r2, [r0, #vp8_writer_buffer_end]
- ldr r3, [r0, #vp8_writer_error]
- mov r1, $pos
- mov r0, $start
- bl vp8_validate_buffer_arm
- pop {r0-r3, r12, lr}
- MEND
-
-; r0 BOOL_CODER *br
-; r1 unsigned char *source
-; r2 unsigned char *source_end
-|vp8_start_encode| PROC
- str r2, [r0, #vp8_writer_buffer_end]
- mov r12, #0
- mov r3, #255
- mvn r2, #23
- str r12, [r0, #vp8_writer_lowvalue]
- str r3, [r0, #vp8_writer_range]
- str r2, [r0, #vp8_writer_count]
- str r12, [r0, #vp8_writer_pos]
- str r1, [r0, #vp8_writer_buffer]
- bx lr
- ENDP
-
-; r0 BOOL_CODER *br
-; r1 int bit
-; r2 int probability
-|vp8_encode_bool| PROC
- push {r4-r10, lr}
-
- mov r4, r2
-
- ldr r2, [r0, #vp8_writer_lowvalue]
- ldr r5, [r0, #vp8_writer_range]
- ldr r3, [r0, #vp8_writer_count]
-
- sub r7, r5, #1 ; range-1
-
- cmp r1, #0
- mul r6, r4, r7 ; ((range-1) * probability)
-
- mov r7, #1
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8)
-
- addne r2, r2, r4 ; if (bit) lowvalue += split
- subne r4, r5, r4 ; if (bit) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r9, #0
- strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r1, [r7, r4]
- cmpge r1, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r9, [r7, r4] ; w->buffer[x]
- add r9, r9, #1
- strb r9, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r9, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r1, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r1, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r9, r1 ; validate_buffer at pos
-
- strb r7, [r9, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- str r2, [r0, #vp8_writer_lowvalue]
- str r5, [r0, #vp8_writer_range]
- str r3, [r0, #vp8_writer_count]
- pop {r4-r10, pc}
- ENDP
-
-; r0 BOOL_CODER *br
-|vp8_stop_encode| PROC
- push {r4-r10, lr}
-
- ldr r2, [r0, #vp8_writer_lowvalue]
- ldr r5, [r0, #vp8_writer_range]
- ldr r3, [r0, #vp8_writer_count]
-
- mov r10, #32
-
-stop_encode_loop
- sub r7, r5, #1 ; range-1
-
- mov r4, r7, lsl #7 ; ((range-1) * 128)
-
- mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero_se ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set_se
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start_se
-token_zero_while_loop_se
- mov r9, #0
- strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start_se
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r1, [r7, r4]
- cmpge r1, #0xff
- beq token_zero_while_loop_se
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r9, [r7, r4] ; w->buffer[x]
- add r9, r9, #1
- strb r9, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set_se
- rsb r4, r6, #24 ; 24-offset
- ldr r9, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r1, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r1, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r9, r1 ; validate_buffer at pos
-
- strb r7, [r9, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r10, r10, #1
- bne stop_encode_loop
-
- str r2, [r0, #vp8_writer_lowvalue]
- str r5, [r0, #vp8_writer_range]
- str r3, [r0, #vp8_writer_count]
- pop {r4-r10, pc}
-
- ENDP
-
-; r0 BOOL_CODER *br
-; r1 int data
-; r2 int bits
-|vp8_encode_value| PROC
- push {r4-r12, lr}
-
- mov r10, r2
-
- ldr r2, [r0, #vp8_writer_lowvalue]
- ldr r5, [r0, #vp8_writer_range]
- ldr r3, [r0, #vp8_writer_count]
-
- rsb r4, r10, #32 ; 32-n
-
- ; v is kept in r1 during the token pack loop
- lsl r1, r1, r4 ; r1 = v << 32 - n
-
-encode_value_loop
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is referred to as "bb"
- lsls r1, r1, #1 ; bit = v >> n
- mov r4, r7, lsl #7 ; ((range-1) * 128)
-
- mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
-
- addcs r2, r2, r4 ; if (bit) lowvalue += split
- subcs r4, r5, r4 ; if (bit) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero_ev ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set_ev
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start_ev
-token_zero_while_loop_ev
- mov r9, #0
- strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start_ev
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop_ev
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r9, [r7, r4] ; w->buffer[x]
- add r9, r9, #1
- strb r9, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set_ev
- rsb r4, r6, #24 ; 24-offset
- ldr r9, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r9, r11 ; validate_buffer at pos
-
- strb r7, [r9, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero_ev
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r10, r10, #1
- bne encode_value_loop
-
- str r2, [r0, #vp8_writer_lowvalue]
- str r5, [r0, #vp8_writer_range]
- str r3, [r0, #vp8_writer_count]
- pop {r4-r12, pc}
- ENDP
-
- END
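The file removed above implemented VP8's boolean (arithmetic) coder in hand-scheduled ARMv5TE assembly. For orientation, below is a minimal C sketch of the encode-one-bit step, reconstructed from the assembly's own comments: split = 1 + (((range-1) * probability) >> 8), normalization shift = clz(range) - 24, and carry propagation back through any run of 0xff bytes. Per |vp8_start_encode| above, the writer starts with lowvalue = 0, range = 255, count = -24 (mvn r2, #23) and pos = 0. The writer_sketch struct and helper name are illustrative stand-ins for the vp8_writer fields the offsets reference, not the exact libvpx source, and the sketch omits the vp8_validate_buffer_arm bounds check that the VALIDATE_POS macro performs before each store.

/* Illustrative stand-ins for the vp8_writer fields the assembly's
   offsets reference; not the libvpx definitions. */
typedef struct {
    unsigned int  lowvalue;   /* vp8_writer_lowvalue */
    unsigned int  range;      /* vp8_writer_range */
    int           count;      /* vp8_writer_count; start_encode sets -24 */
    int           pos;        /* vp8_writer_pos */
    unsigned char *buffer;    /* vp8_writer_buffer */
} writer_sketch;

/* Encode one bit, following the assembly's comments step by step. */
static void encode_bool_sketch(writer_sketch *w, int bit, int probability)
{
    const unsigned int split = 1 + (((w->range - 1) * probability) >> 8);
    int shift = 0;

    if (bit) {
        w->lowvalue += split;             /* if (bit) lowvalue += split */
        w->range    -= split;             /* if (bit) range = range-split */
    } else {
        w->range = split;
    }

    while (w->range < 128) {              /* shift = clz(range) - 24 */
        w->range <<= 1;
        shift++;
    }
    w->count += shift;

    if (w->count >= 0) {                  /* a whole byte is settled */
        const int offset = shift - w->count;

        if ((w->lowvalue << (offset - 1)) & 0x80000000) {
            int x = w->pos - 1;           /* carry: walk back over 0xff run */
            while (x >= 0 && w->buffer[x] == 0xff)
                w->buffer[x--] = 0;
            if (x >= 0)
                w->buffer[x] += 1;
        }
        w->buffer[w->pos++] = (unsigned char)(w->lowvalue >> (24 - offset));
        w->lowvalue = (w->lowvalue << offset) & 0xffffff;
        shift = w->count;
        w->count -= 8;
    }
    w->lowvalue <<= shift;                /* lowvalue <<= shift */
}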
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
deleted file mode 100644
index 90a141c6248..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ /dev/null
@@ -1,317 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_tokens_armv5|
- IMPORT |vp8_validate_buffer_arm|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-
- ; macro for validating write buffer position
- ; needs vp8_writer in r0
- ; start shall not be in r1
- MACRO
- VALIDATE_POS $start, $pos
- push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
- ldr r2, [r0, #vp8_writer_buffer_end]
- ldr r3, [r0, #vp8_writer_error]
- mov r1, $pos
- mov r0, $start
- bl vp8_validate_buffer_arm
- pop {r0-r3, r12, lr}
- MEND
-
-
-; r0 vp8_writer *w
-; r1 const TOKENEXTRA *p
-; r2 int xcount
-; r3 vp8_coef_encodings
-; s0 vp8_extra_bits
-; s1 vp8_coef_tree
-|vp8cx_pack_tokens_armv5| PROC
- push {r4-r12, lr}
- sub sp, sp, #16
-
- ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
- ; sizeof (TOKENEXTRA) is 8
- add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
- str r2, [sp, #0]
- str r3, [sp, #8] ; save vp8_coef_encodings
- ldr r2, [r0, #vp8_writer_lowvalue]
- ldr r5, [r0, #vp8_writer_range]
- ldr r3, [r0, #vp8_writer_count]
- b check_p_lt_stop
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #8] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp8_token_value] ; v
- ldr r8, [r4, #vp8_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #60] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is referred to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
- ; r10 is used earlier in the loop, but r10 is used as
- ; temp variable here. So after r10 is used, reload
- ; vp8_coef_tree into r10
- ldr r10, [sp, #60] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #56] ; vp8_extra_bits
- ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
- ; element. Here vp8_extra_bit_struct == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp8_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp8_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp8_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp8_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp8_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp8_writer_pos]
- mvn r3, #7
- ldr r7, [r0, #vp8_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #vp8_writer_pos]
-
- VALIDATE_POS r7, r12 ; validate_buffer at pos
-
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- str r2, [r0, #vp8_writer_lowvalue]
- str r5, [r0, #vp8_writer_range]
- str r3, [r0, #vp8_writer_count]
- add sp, sp, #16
- pop {r4-r12, pc}
- ENDP
-
- END
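The token packer deleted above drives that same bit coder: for each TOKENEXTRA it walks vp8_coef_tree, coding one bit of the token's value per tree node at the per-context probability pp[i>>1]; it then repeats the same loop over b->tree for the token's extra bits and finally codes the sign at probability 128 (split = (range + 1) >> 1, the no_extra_bits section above). Below is a hedged C sketch of the core tree walk, reusing encode_bool_sketch and writer_sketch from the earlier sketch. The struct shapes mirror the 8-byte TOKENEXTRA and vp8_token layouts the offsets imply, but they are illustrative, not the libvpx definitions.

/* Minimal shapes matching the offsets above (context_tree at 0, Extra at
   4, then Token and skip_eob_node; sizeof == 8 on 32-bit ARM). */
typedef struct {
    const unsigned char *context_tree;   /* pp: per-node probabilities */
    short Extra;
    unsigned char Token;
    unsigned char skip_eob_node;
} token_sketch;

typedef struct {
    int value;                           /* vp8_token_value */
    int len;                             /* vp8_token_len */
} coef_encoding_sketch;

/* Pack one token: walk the coefficient tree, coding one bit of the
   token's value per node (msb first) at that node's probability. */
static void pack_token_sketch(writer_sketch *w, const token_sketch *p,
                              const coef_encoding_sketch *a,
                              const signed char *coef_tree)
{
    int i = 0;
    int n = a->len;
    unsigned int v;

    if (p->skip_eob_node) {              /* vp8-specific skip_eob_node */
        i = 2;
        --n;
    }
    v = (unsigned int)a->value << (32 - n);  /* keep v left-aligned; n >= 1 */

    while (n--) {
        const int bb = v >> 31;          /* next bit shifted off of v */
        v <<= 1;
        encode_bool_sketch(w, bb, p->context_tree[i >> 1]);
        i = coef_tree[i + bb];           /* i = vp8_coef_tree[i + bb] */
    }
}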
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
deleted file mode 100644
index 3a8d17a81b1..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ /dev/null
@@ -1,352 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_mb_row_tokens_armv5|
- IMPORT |vp8_validate_buffer_arm|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-
- ; macro for validating write buffer position
- ; needs vp8_writer in r0
- ; start shall not be in r1
- MACRO
- VALIDATE_POS $start, $pos
- push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
- ldr r2, [r0, #vp8_writer_buffer_end]
- ldr r3, [r0, #vp8_writer_error]
- mov r1, $pos
- mov r0, $start
- bl vp8_validate_buffer_arm
- pop {r0-r3, r12, lr}
- MEND
-
-; r0 VP8_COMP *cpi
-; r1 vp8_writer *w
-; r2 vp8_coef_encodings
-; r3 vp8_extra_bits
-; s0 vp8_coef_tree
-
-|vp8cx_pack_mb_row_tokens_armv5| PROC
- push {r4-r12, lr}
- sub sp, sp, #24
-
- ; Compute address of cpi->common.mb_rows
- ldr r4, _VP8_COMP_common_
- ldr r6, _VP8_COMMON_MBrows_
- add r4, r0, r4
-
- ldr r5, [r4, r6] ; load up mb_rows
-
- str r2, [sp, #20] ; save vp8_coef_encodings
- str r5, [sp, #12] ; save mb_rows
- str r3, [sp, #8] ; save vp8_extra_bits
-
- ldr r4, _VP8_COMP_tplist_
- add r4, r0, r4
- ldr r7, [r4, #0] ; dereference cpi->tp_list
-
- mov r0, r1 ; keep same as other loops
-
- ldr r2, [r0, #vp8_writer_lowvalue]
- ldr r5, [r0, #vp8_writer_range]
- ldr r3, [r0, #vp8_writer_count]
-
-mb_row_loop
-
- ldr r1, [r7, #tokenlist_start]
- ldr r9, [r7, #tokenlist_stop]
- str r9, [sp, #0] ; save stop for later comparison
- str r7, [sp, #16] ; tokenlist address for next time
-
- b check_p_lt_stop
-
- ; actual work gets done here!
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #20] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp8_token_value] ; v
- ldr r8, [r4, #vp8_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #64] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is referred to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
- ; r10 is used earlier in the loop, but r10 is used as
- ; temp variable here. So after r10 is used, reload
- ; vp8_coef_tree into r10
- ldr r10, [sp, #64] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #8] ; vp8_extra_bits
- ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
- ; element. Here vp8_extra_bit_struct == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp8_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp8_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp8_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp8_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp8_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp8_writer_pos]
- mvn r3, #7
- ldr r7, [r0, #vp8_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #vp8_writer_pos]
-
- VALIDATE_POS r7, r12 ; validate_buffer at pos
-
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- ldr r6, [sp, #12] ; mb_rows
- ldr r7, [sp, #16] ; tokenlist address
- subs r6, r6, #1
- add r7, r7, #TOKENLIST_SZ ; next element in the array
- str r6, [sp, #12]
- bne mb_row_loop
-
- str r2, [r0, #vp8_writer_lowvalue]
- str r5, [r0, #vp8_writer_range]
- str r3, [r0, #vp8_writer_count]
- add sp, sp, #24
- pop {r4-r12, pc}
- ENDP
-
-_VP8_COMP_common_
- DCD vp8_comp_common
-_VP8_COMMON_MBrows_
- DCD vp8_common_mb_rows
-_VP8_COMP_tplist_
- DCD vp8_comp_tplist
-
- END
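The mb-row variant deleted above is the same token loop run once per macroblock row, taking each row's [start, stop) token range from cpi->tp_list (the tokenlist_start/tokenlist_stop offsets). A sketch of the wrapper, reusing the illustrative types from the previous sketches:

typedef struct {
    token_sketch *start;   /* tokenlist_start */
    token_sketch *stop;    /* tokenlist_stop */
} tokenlist_sketch;

static void pack_mb_row_tokens_sketch(writer_sketch *w,
                                      const tokenlist_sketch *tp_list,
                                      int mb_rows,
                                      const coef_encoding_sketch *encodings,
                                      const signed char *coef_tree)
{
    int mb_row;
    for (mb_row = 0; mb_row < mb_rows; mb_row++) {
        const token_sketch *p;
        for (p = tp_list[mb_row].start; p < tp_list[mb_row].stop; p++)
            pack_token_sketch(w, p, encodings + p->Token, coef_tree);
    }
}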
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
deleted file mode 100644
index e9aa4958f30..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ /dev/null
@@ -1,471 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
- IMPORT |vp8_validate_buffer_arm|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
- ; macro for validating write buffer position
- ; needs vp8_writer in r0
- ; start shall not be in r1
- MACRO
- VALIDATE_POS $start, $pos
- push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
- ldr r2, [r0, #vp8_writer_buffer_end]
- ldr r3, [r0, #vp8_writer_error]
- mov r1, $pos
- mov r0, $start
- bl vp8_validate_buffer_arm
- pop {r0-r3, r12, lr}
- MEND
-
-; r0 VP8_COMP *cpi
-; r1 unsigned char *cx_data
-; r2 const unsigned char *cx_data_end
-; r3 int num_part
-; s0 vp8_coef_encodings
-; s1 vp8_extra_bits,
-; s2 const vp8_tree_index *
-
-|vp8cx_pack_tokens_into_partitions_armv5| PROC
- push {r4-r12, lr}
- sub sp, sp, #40
-
- ; Compute address of cpi->common.mb_rows
- ldr r4, _VP8_COMP_common_
- ldr r6, _VP8_COMMON_MBrows_
- add r4, r0, r4
-
- ldr r5, [r4, r6] ; load up mb_rows
-
- str r5, [sp, #36] ; save mb_rows
- str r1, [sp, #24] ; save ptr = cx_data
- str r3, [sp, #20] ; save num_part
- str r2, [sp, #8] ; save cx_data_end
-
- ldr r4, _VP8_COMP_tplist_
- add r4, r0, r4
- ldr r7, [r4, #0] ; dereference cpi->tp_list
- str r7, [sp, #32] ; store start of cpi->tp_list
-
- ldr r11, _VP8_COMP_bc_ ; load up vp8_writer out of cpi
- add r0, r0, r11
-
- mov r11, #0
- str r11, [sp, #28] ; i
-
-numparts_loop
- ldr r2, _vp8_writer_sz_ ; load up sizeof(vp8_writer)
- add r0, r2 ; bc[i + 1]
-
- ldr r10, [sp, #24] ; ptr
- ldr r5, [sp, #36] ; move mb_rows to the counting section
- subs r5, r5, r11 ; move start point with each partition
- ; mb_rows starts at i
- str r5, [sp, #12]
-
- ; Reset all of the VP8 Writer data for each partition that
- ; is processed.
- ; start_encode
-
- ldr r3, [sp, #8]
- str r3, [r0, #vp8_writer_buffer_end]
-
- mov r2, #0 ; vp8_writer_lowvalue
- mov r5, #255 ; vp8_writer_range
- mvn r3, #23 ; vp8_writer_count
-
- str r2, [r0, #vp8_writer_pos]
- str r10, [r0, #vp8_writer_buffer]
-
- ble end_partition ; if (mb_rows <= 0) end partition
-
-mb_row_loop
-
- ldr r1, [r7, #tokenlist_start]
- ldr r9, [r7, #tokenlist_stop]
- str r9, [sp, #0] ; save stop for later comparison
- str r7, [sp, #16] ; tokenlist address for next time
-
- b check_p_lt_stop
-
- ; actual work gets done here!
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #80] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp8_token_value] ; v
- ldr r8, [r4, #vp8_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #88] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is referred to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
- ; r10 is used earlier in the loop, but r10 is used as
- ; temp variable here. So after r10 is used, reload
- ; vp8_coef_tree into r10
- ldr r10, [sp, #88] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #84] ; vp8_extra_bits
- ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
- ; element. Here vp8_extra_bit_struct == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp8_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp8_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp8_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp8_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp8_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp8_writer_pos]
- mvn r3, #7 ; count = -8
- ldr r7, [r0, #vp8_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #vp8_writer_pos]
-
- VALIDATE_POS r7, r12 ; validate_buffer at pos
-
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- ldr r10, [sp, #20] ; num_part
- mov r1, #TOKENLIST_SZ
- mul r1, r10, r1
-
- ldr r6, [sp, #12] ; mb_rows
- ldr r7, [sp, #16] ; tokenlist address
- subs r6, r6, r10
- add r7, r7, r1 ; next element in the array
- str r6, [sp, #12]
- bgt mb_row_loop
-
-end_partition
- mov r12, #32
-
-stop_encode_loop
- sub r7, r5, #1 ; range-1
-
- mov r4, r7, lsl #7 ; ((range-1) * 128)
-
- mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero_se ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set_se
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start_se
-token_zero_while_loop_se
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start_se
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop_se
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set_se
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r12, r12, #1
- bne stop_encode_loop
-
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- ldr r12, [sp, #24] ; ptr
- add r12, r12, r4 ; ptr += w->pos
- str r12, [sp, #24]
-
- ldr r11, [sp, #28] ; i
- ldr r10, [sp, #20] ; num_part
-
- add r11, r11, #1 ; i++
- str r11, [sp, #28]
-
- ldr r7, [sp, #32] ; cpi->tp_list[i]
- mov r1, #TOKENLIST_SZ
- add r7, r7, r1 ; next element in cpi->tp_list
- str r7, [sp, #32] ; cpi->tp_list[i+1]
-
- cmp r10, r11
- bgt numparts_loop
-
- add sp, sp, #40
- pop {r4-r12, pc}
- ENDP
-
-_VP8_COMP_common_
- DCD vp8_comp_common
-_VP8_COMMON_MBrows_
- DCD vp8_common_mb_rows
-_VP8_COMP_tplist_
- DCD vp8_comp_tplist
-_VP8_COMP_bc_
- DCD vp8_comp_bc
-_vp8_writer_sz_
- DCD vp8_writer_sz
-
- END
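The partition packer deleted above adds one more level of control flow: partition i gets its own writer (the assembly starts at cpi->bc + 1 and steps by sizeof(vp8_writer)), is reset by the inlined start_encode, packs macroblock rows i, i + num_part, i + 2*num_part and so on, and is flushed by the inlined stop_encode (32 bits coded at probability 128). A sketch under the same illustrative types as the earlier sketches:

static void pack_tokens_into_partitions_sketch(writer_sketch *bc, /* cpi->bc */
                                               const tokenlist_sketch *tp_list,
                                               int mb_rows,
                                               unsigned char *cx_data,
                                               int num_part,
                                               const coef_encoding_sketch *encodings,
                                               const signed char *coef_tree)
{
    unsigned char *ptr = cx_data;
    int i, j, mb_row;

    for (i = 0; i < num_part; i++) {
        writer_sketch *w = &bc[i + 1];        /* asm starts at cpi->bc + 1 */
        const token_sketch *p;

        /* inlined start_encode: reset the writer over the shared buffer */
        w->lowvalue = 0;
        w->range    = 255;
        w->count    = -24;
        w->pos      = 0;
        w->buffer   = ptr;

        for (mb_row = i; mb_row < mb_rows; mb_row += num_part)
            for (p = tp_list[mb_row].start; p < tp_list[mb_row].stop; p++)
                pack_token_sketch(w, p, encodings + p->Token, coef_tree);

        for (j = 0; j < 32; j++)              /* inlined stop_encode */
            encode_bool_sketch(w, 0, 128);

        ptr += w->pos;                        /* next partition follows */
    }
}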
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
deleted file mode 100644
index de35a1e13ca..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_fast_quantize_b_armv6|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 BLOCK *b
-; r1 BLOCKD *d
-|vp8_fast_quantize_b_armv6| PROC
- stmfd sp!, {r1, r4-r11, lr}
-
- ldr r3, [r0, #vp8_block_coeff] ; coeff
- ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
- ldr r5, [r0, #vp8_block_round] ; round
- ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
- ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
- ldr r8, [r1, #vp8_blockd_dequant] ; dequant
-
- ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
- ; is used to update the counter so that
- ; it can be used to mark nonzero
- ; quantized coefficient pairs.
-
- mov r1, #0 ; flags for quantized coeffs
-
- ; PART 1: quantization and dequantization loop
-loop
- ldr r9, [r3], #4 ; [z1 | z0]
- ldr r10, [r5], #4 ; [r1 | r0]
- ldr r11, [r4], #4 ; [q1 | q0]
-
- ssat16 lr, #1, r9 ; [sz1 | sz0]
- eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
- ssub16 r9, r9, lr ; x = (z ^ sz) - sz
- sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
-
- ldr r12, [r3], #4 ; [z3 | z2]
-
- smulbb r0, r9, r11 ; [(x0+r0)*q0]
- smultt r9, r9, r11 ; [(x1+r1)*q1]
-
- ldr r10, [r5], #4 ; [r3 | r2]
-
- ssat16 r11, #1, r12 ; [sz3 | sz2]
- eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
- pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
- ldr r9, [r4], #4 ; [q3 | q2]
- ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
-
- sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
-
- eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
-
- smulbb r10, r12, r9 ; [(x2+r2)*q2]
- smultt r12, r12, r9 ; [(x3+r3)*q3]
-
- ssub16 r0, r0, lr ; x = (y ^ sz) - sz
-
- cmp r0, #0 ; check if zero
- orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
-
- str r0, [r6], #4 ; *qcoeff++ = x
- ldr r9, [r8], #4 ; [dq1 | dq0]
-
- pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
- eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
- ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
-
- cmp r10, #0 ; check if zero
- orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
-
- str r10, [r6], #4 ; *qcoeff++ = x
- ldr r11, [r8], #4 ; [dq3 | dq2]
-
- smulbb r12, r0, r9 ; [x0*dq0]
- smultt r0, r0, r9 ; [x1*dq1]
-
- smulbb r9, r10, r11 ; [x2*dq2]
- smultt r10, r10, r11 ; [x3*dq3]
-
- lsls r2, r2, #2 ; update loop counter
- strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
- strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
- strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
- strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
- add r7, r7, #8 ; dqcoeff += 8
- bne loop
-
- ; PART 2: check position for eob...
- ldr r11, [sp, #0] ; restore BLOCKD pointer
- mov lr, #0 ; init eob
- cmp r1, #0 ; coeffs after quantization?
- ldr r12, [r11, #vp8_blockd_eob]
- beq end ; skip eob calculations if all zero
-
- ldr r0, [r11, #vp8_blockd_qcoeff]
-
- ; check shortcut for nonzero qcoeffs
- tst r1, #0x80
- bne quant_coeff_15_14
- tst r1, #0x20
- bne quant_coeff_13_11
- tst r1, #0x8
- bne quant_coeff_12_7
- tst r1, #0x40
- bne quant_coeff_10_9
- tst r1, #0x10
- bne quant_coeff_8_3
- tst r1, #0x2
- bne quant_coeff_6_5
- tst r1, #0x4
- bne quant_coeff_4_2
- b quant_coeff_1_0
-
-quant_coeff_15_14
- ldrh r2, [r0, #30] ; rc=15, i=15
- mov lr, #16
- cmp r2, #0
- bne end
-
- ldrh r3, [r0, #28] ; rc=14, i=14
- mov lr, #15
- cmp r3, #0
- bne end
-
-quant_coeff_13_11
- ldrh r2, [r0, #22] ; rc=11, i=13
- mov lr, #14
- cmp r2, #0
- bne end
-
-quant_coeff_12_7
- ldrh r3, [r0, #14] ; rc=7, i=12
- mov lr, #13
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #20] ; rc=10, i=11
- mov lr, #12
- cmp r2, #0
- bne end
-
-quant_coeff_10_9
- ldrh r3, [r0, #26] ; rc=13, i=10
- mov lr, #11
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #24] ; rc=12, i=9
- mov lr, #10
- cmp r2, #0
- bne end
-
-quant_coeff_8_3
- ldrh r3, [r0, #18] ; rc=9, i=8
- mov lr, #9
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #12] ; rc=6, i=7
- mov lr, #8
- cmp r2, #0
- bne end
-
-quant_coeff_6_5
- ldrh r3, [r0, #6] ; rc=3, i=6
- mov lr, #7
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #4] ; rc=2, i=5
- mov lr, #6
- cmp r2, #0
- bne end
-
-quant_coeff_4_2
- ldrh r3, [r0, #10] ; rc=5, i=4
- mov lr, #5
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #16] ; rc=8, i=3
- mov lr, #4
- cmp r2, #0
- bne end
-
- ldrh r3, [r0, #8] ; rc=4, i=2
- mov lr, #3
- cmp r3, #0
- bne end
-
-quant_coeff_1_0
- ldrh r2, [r0, #2] ; rc=1, i=1
- mov lr, #2
- cmp r2, #0
- bne end
-
- mov lr, #1 ; rc=0, i=0
-
-end
- strb lr, [r12]
- ldmfd sp!, {r1, r4-r11, pc}
-
- ENDP
-
-loop_count
- DCD 0x1000000
-
- END
-
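The deleted ARMv6 fast quantizer processes coefficient pairs with ssat16/ssub16/smulbb, but the per-coefficient math is spelled out in its comments: x = (z ^ sz) - sz, y = ((x + round) * quant) >> 16, the sign restored the same way, dqcoeff = x * dequant, with eob tracked in zig-zag scan order (PART 2 above). A scalar C sketch of the same math, using the scan order implied by the inv_zig_zag table that appears further down in this diff; the function shape and names are illustrative:

static void fast_quantize_b_sketch(const short *coeff, const short *round,
                                   const short *quant_fast,
                                   const short *dequant,
                                   short *qcoeff, short *dqcoeff, char *eob)
{
    /* zig-zag scan order: the 0-based inverse of inv_zig_zag below */
    static const int zig_zag[16] = { 0, 1, 4, 8, 5, 2, 3, 6,
                                     9, 12, 13, 10, 7, 11, 14, 15 };
    int i, last = 0;

    for (i = 0; i < 16; i++) {
        const int rc = zig_zag[i];
        const int z  = coeff[rc];
        const int sz = z >> 31;                       /* 0 or -1 (sign) */
        int x = (z ^ sz) - sz;                        /* x = abs(z) */
        x = ((x + round[rc]) * quant_fast[rc]) >> 16; /* quantize */
        x = (x ^ sz) - sz;                            /* restore sign */
        qcoeff[rc]  = (short)x;
        dqcoeff[rc] = (short)(x * dequant[rc]);       /* dequantize */
        if (x)
            last = i + 1;                             /* 1-based eob */
    }
    *eob = (char)last;
}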
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
deleted file mode 100644
index 05746cf7fe9..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
+++ /dev/null
@@ -1,272 +0,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_subtract_mby_armv6|
- EXPORT |vp8_subtract_mbuv_armv6|
- EXPORT |vp8_subtract_b_armv6|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 BLOCK *be
-; r1 BLOCKD *bd
-; r2 int pitch
-|vp8_subtract_b_armv6| PROC
-
- stmfd sp!, {r4-r9}
-
- ldr r4, [r0, #vp8_block_base_src]
- ldr r5, [r0, #vp8_block_src]
- ldr r6, [r0, #vp8_block_src_diff]
-
- ldr r3, [r4]
- ldr r7, [r0, #vp8_block_src_stride]
- add r3, r3, r5 ; src = *base_src + src
- ldr r8, [r1, #vp8_blockd_predictor]
-
- mov r9, #4 ; loop count
-
-loop_block
-
- ldr r0, [r3], r7 ; src
- ldr r1, [r8], r2 ; pred
-
- uxtb16 r4, r0 ; [s2 | s0]
- uxtb16 r5, r1 ; [p2 | p0]
- uxtb16 r0, r0, ror #8 ; [s3 | s1]
- uxtb16 r1, r1, ror #8 ; [p3 | p1]
-
- usub16 r4, r4, r5 ; [d2 | d0]
- usub16 r5, r0, r1 ; [d3 | d1]
-
- subs r9, r9, #1 ; decrement loop counter
-
- pkhbt r0, r4, r5, lsl #16 ; [d1 | d0]
- pkhtb r1, r5, r4, asr #16 ; [d3 | d2]
-
- str r0, [r6, #0] ; diff
- str r1, [r6, #4] ; diff
-
- add r6, r6, r2, lsl #1 ; update diff pointer
- bne loop_block
-
- ldmfd sp!, {r4-r9}
- mov pc, lr
-
- ENDP
-
-
-; r0 short *diff
-; r1 unsigned char *usrc
-; r2 unsigned char *vsrc
-; r3 int src_stride
-; sp unsigned char *upred
-; sp unsigned char *vpred
-; sp int pred_stride
-|vp8_subtract_mbuv_armv6| PROC
-
- stmfd sp!, {r4-r11}
-
- add r0, r0, #512 ; set *diff point to Cb
- mov r4, #8 ; loop count
- ldr r5, [sp, #32] ; upred
- ldr r12, [sp, #40] ; pred_stride
-
- ; Subtract U block
-loop_u
- ldr r6, [r1] ; usrc (A)
- ldr r7, [r5] ; upred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r1, #4] ; usrc (B)
- ldr r11, [r5, #4] ; upred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- add r1, r1, r3 ; update usrc pointer
- add r5, r5, r12 ; update upred pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (B)
-
- bne loop_u
-
- ldr r5, [sp, #36] ; vpred
- mov r4, #8 ; loop count
-
- ; Subtract V block
-loop_v
- ldr r6, [r2] ; vsrc (A)
- ldr r7, [r5] ; vpred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r2, #4] ; vsrc (B)
- ldr r11, [r5, #4] ; vpred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- add r2, r2, r3 ; update vsrc pointer
- add r5, r5, r12 ; update vpred pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (B)
-
- bne loop_v
-
- ldmfd sp!, {r4-r11}
- bx lr
-
- ENDP
-
-
-; r0 short *diff
-; r1 unsigned char *src
-; r2 int src_stride
-; r3 unsigned char *pred
-; sp int pred_stride
-|vp8_subtract_mby_armv6| PROC
-
- stmfd sp!, {r4-r11}
- ldr r12, [sp, #32] ; pred_stride
- mov r4, #16
-loop
- ldr r6, [r1] ; src (A)
- ldr r7, [r3] ; pred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r1, #4] ; src (B)
- ldr r11, [r3, #4] ; pred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- ldr r10, [r1, #8] ; src (C)
- ldr r11, [r3, #8] ; pred (C)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- uxtb16 r8, r10 ; [s2 | s0] (C)
- str r9, [r0], #4 ; diff (B)
-
- uxtb16 r9, r11 ; [p2 | p0] (C)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
-
- usub16 r6, r8, r9 ; [d2 | d0] (C)
- usub16 r7, r10, r11 ; [d3 | d1] (C)
-
- ldr r10, [r1, #12] ; src (D)
- ldr r11, [r3, #12] ; pred (D)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
-
- str r8, [r0], #4 ; diff (C)
- uxtb16 r8, r10 ; [s2 | s0] (D)
- str r9, [r0], #4 ; diff (C)
-
- uxtb16 r9, r11 ; [p2 | p0] (D)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
-
- usub16 r6, r8, r9 ; [d2 | d0] (D)
- usub16 r7, r10, r11 ; [d3 | d1] (D)
-
- add r1, r1, r2 ; update src pointer
- add r3, r3, r12 ; update pred pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
-
- str r8, [r0], #4 ; diff (D)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (D)
-
- bne loop
-
- ldmfd sp!, {r4-r11}
- bx lr
-
- ENDP
-
- END
-
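The deleted subtract kernels pack four pixels per 32-bit word with uxtb16/usub16, but all three entry points compute the same elementwise residual diff = src - pred. A plain C sketch follows; vp8_subtract_mby covers the 16x16 luma block, and vp8_subtract_mbuv runs the same loop for the 8x8 U and V planes with diff offset by 256 shorts (the assembly advances the diff pointer by 512 bytes up front):

static void subtract_block_sketch(short *diff, int diff_stride,
                                  const unsigned char *src, int src_stride,
                                  const unsigned char *pred, int pred_stride,
                                  int rows, int cols)
{
    int r, c;
    for (r = 0; r < rows; r++) {
        for (c = 0; c < cols; c++)
            diff[c] = (short)(src[c] - pred[c]);
        diff += diff_stride;
        src  += src_stride;
        pred += pred_stride;
    }
}

/* e.g. luma: subtract_block_sketch(diff,       16, src,  src_stride,
                                    pred,  pred_stride, 16, 16);
        U:    subtract_block_sketch(diff + 256,  8, usrc, src_stride,
                                    upred, pred_stride,  8,  8);
        V:    subtract_block_sketch(diff + 320,  8, vsrc, src_stride,
                                    vpred, pred_stride,  8,  8);           */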
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/boolhuff_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/boolhuff_arm.c
deleted file mode 100644
index 17a941bfc65..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/boolhuff_arm.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/boolhuff.h"
-#include "vpx/internal/vpx_codec_internal.h"
-
-const unsigned int vp8_prob_cost[256] =
-{
- 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
- 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
- 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
- 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
- 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
- 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
- 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
- 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
- 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
- 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
- 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
- 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
- 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
- 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
- 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
- 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
-};
-
-int vp8_validate_buffer_arm(const unsigned char *start,
- size_t len,
- const unsigned char *end,
- struct vpx_internal_error_info *error)
-{
- return validate_buffer(start, len, end, error);
-}
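The vp8_prob_cost table removed above stores, for a bit of probability p/256, an approximation of its information content in 1/256-bit units, roughly -256 * log2(p / 256): entry 128 is about 256 (one bit) and entry 1 about 2048 (eight bits), with the stored values sitting slightly below the exact figures. A hedged generator sketch, an approximation rather than the exact formula libvpx used:

#include <math.h>

/* Approximate generator for the table above (sketch only; the stored
   values sit a hair below these at exact powers of two). */
static unsigned int prob_cost_sketch(int p)   /* p in [1, 255] */
{
    const double bits = -log2(p / 256.0);     /* information content */
    const unsigned int c = (unsigned int)(256.0 * bits);
    return c < 1 ? 1 : c;                     /* table clamps to >= 1 */
}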
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
deleted file mode 100644
index 9374310e58f..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ /dev/null
@@ -1,258 +0,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_fast_quantize_b_neon|
- EXPORT |vp8_fast_quantize_b_pair_neon|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=4
-
-;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
-|vp8_fast_quantize_b_pair_neon| PROC
-
- stmfd sp!, {r4-r9}
- vstmdb sp!, {q4-q7}
-
- ldr r4, [r0, #vp8_block_coeff]
- ldr r5, [r0, #vp8_block_quant_fast]
- ldr r6, [r0, #vp8_block_round]
-
- vld1.16 {q0, q1}, [r4@128] ; load z
-
- ldr r7, [r2, #vp8_blockd_qcoeff]
-
- vabs.s16 q4, q0 ; calculate x = abs(z)
- vabs.s16 q5, q1
-
- ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
- vshr.s16 q2, q0, #15 ; sz
- vshr.s16 q3, q1, #15
-
- vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
- vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
-
- ldr r4, [r1, #vp8_block_coeff]
-
- vadd.s16 q4, q6 ; x + Round
- vadd.s16 q5, q7
-
- vld1.16 {q0, q1}, [r4@128] ; load z2
-
- vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
- vqdmulh.s16 q5, q9
-
- vabs.s16 q10, q0 ; calculate x2 = abs(z_2)
- vabs.s16 q11, q1
- vshr.s16 q12, q0, #15 ; sz2
- vshr.s16 q13, q1, #15
-
- ;modify data to have its original sign
- veor.s16 q4, q2 ; y^sz
- veor.s16 q5, q3
-
- vadd.s16 q10, q6 ; x2 + Round
- vadd.s16 q11, q7
-
- ldr r8, [r2, #vp8_blockd_dequant]
-
- vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16
- vqdmulh.s16 q11, q9
-
- vshr.s16 q4, #1 ; right shift 1 after vqdmulh
- vshr.s16 q5, #1
-
- vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
-
- vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
- vsub.s16 q5, q3
-
- vshr.s16 q10, #1 ; right shift 1 after vqdmulh
- vshr.s16 q11, #1
-
- ldr r9, [r2, #vp8_blockd_dqcoeff]
-
- veor.s16 q10, q12 ; y2^sz2
- veor.s16 q11, q13
-
- vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
-
-
- vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
- vsub.s16 q11, q13
-
- ldr r6, [r3, #vp8_blockd_qcoeff]
-
- vmul.s16 q2, q6, q4 ; x * Dequant
- vmul.s16 q3, q7, q5
-
- adr r0, inv_zig_zag ; load ptr of inverse zigzag table
-
- vceq.s16 q8, q8 ; set q8 to all 1
-
- vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
-
- vmul.s16 q12, q6, q10 ; x2 * Dequant
- vmul.s16 q13, q7, q11
-
- vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
-
- vtst.16 q14, q4, q8 ; now find eob
- vtst.16 q15, q5, q8 ; non-zero element is set to all 1
-
- vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
-
- ldr r7, [r3, #vp8_blockd_dqcoeff]
-
- vand q0, q6, q14 ; get all valid numbers from scan array
- vand q1, q7, q15
-
- vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant
-
- vtst.16 q2, q10, q8 ; now find eob
- vtst.16 q3, q11, q8 ; non-zero element is set to all 1
-
- vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
-
- vand q10, q6, q2 ; get all valid numbers from scan array
- vand q11, q7, q3
- vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
-
- vmax.u16 d0, d0, d1
- vmax.u16 d20, d20, d21
- vmovl.u16 q0, d0
- vmovl.u16 q10, d20
-
- vmax.u32 d0, d0, d1
- vmax.u32 d20, d20, d21
- vpmax.u32 d0, d0, d0
- vpmax.u32 d20, d20, d20
-
- ldr r4, [r2, #vp8_blockd_eob]
- ldr r5, [r3, #vp8_blockd_eob]
-
- vst1.8 {d0[0]}, [r4] ; store eob
- vst1.8 {d20[0]}, [r5] ; store eob
-
- vldmia sp!, {q4-q7}
- ldmfd sp!, {r4-r9}
- bx lr
-
- ENDP
-
-;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-|vp8_fast_quantize_b_neon| PROC
-
- stmfd sp!, {r4-r7}
-
- ldr r3, [r0, #vp8_block_coeff]
- ldr r4, [r0, #vp8_block_quant_fast]
- ldr r5, [r0, #vp8_block_round]
-
- vld1.16 {q0, q1}, [r3@128] ; load z
- vorr.s16 q14, q0, q1 ; check if all zero (step 1)
- ldr r6, [r1, #vp8_blockd_qcoeff]
- ldr r7, [r1, #vp8_blockd_dqcoeff]
- vorr.s16 d28, d28, d29 ; check if all zero (step 2)
-
- vabs.s16 q12, q0 ; calculate x = abs(z)
- vabs.s16 q13, q1
-
- ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
- vshr.s16 q2, q0, #15 ; sz
- vmov r2, r3, d28 ; check if all zero (step 3)
- vshr.s16 q3, q1, #15
-
- vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]
- vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]
-
- vadd.s16 q12, q14 ; x + Round
- vadd.s16 q13, q15
-
- adr r0, inv_zig_zag ; load ptr of inverse zigzag table
-
- vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16
- vqdmulh.s16 q13, q9
-
- vld1.16 {q10, q11}, [r0@128]; load inverse scan order
-
- vceq.s16 q8, q8 ; set q8 to all 1
-
- ldr r4, [r1, #vp8_blockd_dequant]
-
- vshr.s16 q12, #1 ; right shift 1 after vqdmulh
- vshr.s16 q13, #1
-
- ldr r5, [r1, #vp8_blockd_eob]
-
- orr r2, r2, r3 ; check if all zero (step 4)
- cmp r2, #0 ; check if all zero (step 5)
- beq zero_output ; check if all zero (step 6)
-
- ;modify data to have its original sign
- veor.s16 q12, q2 ; y^sz
- veor.s16 q13, q3
-
- vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
- vsub.s16 q13, q3
-
- vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
-
- vtst.16 q14, q12, q8 ; now find eob
- vtst.16 q15, q13, q8 ; non-zero element is set to all 1
-
- vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1
-
- vand q10, q10, q14 ; get all valid numbers from scan array
- vand q11, q11, q15
-
-
- vmax.u16 q0, q10, q11 ; find maximum value in q0, q1
- vmax.u16 d0, d0, d1
- vmovl.u16 q0, d0
-
- vmul.s16 q2, q12 ; x * Dequant
- vmul.s16 q3, q13
-
- vmax.u32 d0, d0, d1
- vpmax.u32 d0, d0, d0
-
- vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
-
- vst1.8 {d0[0]}, [r5] ; store eob
-
- ldmfd sp!, {r4-r7}
- bx lr
-
-zero_output
- strb r2, [r5] ; store eob
- vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
- vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
-
- ldmfd sp!, {r4-r7}
- bx lr
-
- ENDP
-
-; default inverse zigzag table is defined in vp8/common/entropy.c
- ALIGN 16 ; enable use of @128 bit aligned loads
-inv_zig_zag
- DCW 0x0001, 0x0002, 0x0006, 0x0007
- DCW 0x0003, 0x0005, 0x0008, 0x000d
- DCW 0x0004, 0x0009, 0x000c, 0x000e
- DCW 0x000a, 0x000b, 0x000f, 0x0010
-
- END
-
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
new file mode 100644
index 00000000000..e5824bfb217
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+
+static const uint16_t inv_zig_zag[16] = {
+ 1, 2, 6, 7,
+ 3, 5, 8, 13,
+ 4, 9, 12, 14,
+ 10, 11, 15, 16
+};
+
+void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
+ const int16x8_t one_q = vdupq_n_s16(-1),
+ z0 = vld1q_s16(b->coeff),
+ z1 = vld1q_s16(b->coeff + 8),
+ round0 = vld1q_s16(b->round),
+ round1 = vld1q_s16(b->round + 8),
+ quant0 = vld1q_s16(b->quant_fast),
+ quant1 = vld1q_s16(b->quant_fast + 8),
+ dequant0 = vld1q_s16(d->dequant),
+ dequant1 = vld1q_s16(d->dequant + 8);
+ const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
+ zig_zag1 = vld1q_u16(inv_zig_zag + 8);
+ int16x8_t x0, x1, sz0, sz1, y0, y1;
+ uint16x8_t eob0, eob1;
+ uint16x4_t eob_d16;
+ uint32x2_t eob_d32;
+ uint32x4_t eob_q32;
+
+ /* sign of z: z >> 15 */
+ sz0 = vshrq_n_s16(z0, 15);
+ sz1 = vshrq_n_s16(z1, 15);
+
+ /* x = abs(z) */
+ x0 = vabsq_s16(z0);
+ x1 = vabsq_s16(z1);
+
+ /* x += round */
+ x0 = vaddq_s16(x0, round0);
+ x1 = vaddq_s16(x1, round1);
+
+ /* y = 2 * (x * quant) >> 16 */
+ y0 = vqdmulhq_s16(x0, quant0);
+ y1 = vqdmulhq_s16(x1, quant1);
+
+ /* Compensate for doubling in vqdmulhq */
+ y0 = vshrq_n_s16(y0, 1);
+ y1 = vshrq_n_s16(y1, 1);
+
+ /* Restore sign bit */
+ y0 = veorq_s16(y0, sz0);
+ y1 = veorq_s16(y1, sz1);
+ x0 = vsubq_s16(y0, sz0);
+ x1 = vsubq_s16(y1, sz1);
+
+ /* find non-zero elements */
+ eob0 = vtstq_s16(x0, one_q);
+ eob1 = vtstq_s16(x1, one_q);
+
+ /* mask zig zag */
+ eob0 = vandq_u16(eob0, zig_zag0);
+ eob1 = vandq_u16(eob1, zig_zag1);
+
+ /* select the largest value */
+ eob0 = vmaxq_u16(eob0, eob1);
+ eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
+ eob_q32 = vmovl_u16(eob_d16);
+ eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
+ eob_d32 = vpmax_u32(eob_d32, eob_d32);
+
+ /* qcoeff = x */
+ vst1q_s16(d->qcoeff, x0);
+ vst1q_s16(d->qcoeff + 8, x1);
+
+ /* dqcoeff = x * dequant */
+ vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
+ vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
+
+ vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+}
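
For readers following the port above: a minimal scalar sketch of what the intrinsics compute, assuming the BLOCK/BLOCKD field types from vp8/encoder/block.h (short coefficient arrays, char *eob) and reusing the inv_zig_zag table defined in the new file (scan position + 1 per coefficient). The vqdmulhq_s16 plus right-shift-by-1 pair is a saturating form of the plain (x * quant) >> 16 product shown here. This is a sketch, not the library's implementation.

    static void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d) {
      int i, eob = 0;
      for (i = 0; i < 16; ++i) {
        const int z = b->coeff[i];
        const int sz = z >> 31;              /* 0 or -1: sign mask */
        int x = (z ^ sz) - sz;               /* x = abs(z) */
        int y = ((x + b->round[i]) * b->quant_fast[i]) >> 16;
        x = (y ^ sz) - sz;                   /* restore the sign of z */
        d->qcoeff[i] = (short)x;
        d->dqcoeff[i] = (short)(x * d->dequant[i]);
        if (x && inv_zig_zag[i] > eob)       /* last nonzero, in scan order */
          eob = inv_zig_zag[i];
      }
      *d->eob = (char)eob;
    }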
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c
deleted file mode 100644
index 80d9ad05414..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vp8/encoder/block.h"
-#include <math.h>
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/encoder/quantize.h"
-#include "vp8/common/entropy.h"
-
-
-#if HAVE_NEON
-
-/* vp8_quantize_mbX functions here differs from corresponding ones in
- * quantize.c only by using quantize_b_pair function pointer instead of
- * the regular quantize_b function pointer */
-void vp8_quantize_mby_neon(MACROBLOCK *x)
-{
- int i;
- int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
- && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
- for (i = 0; i < 16; i+=2)
- x->quantize_b_pair(&x->block[i], &x->block[i+1],
- &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-
- if(has_2nd_order)
- x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp8_quantize_mb_neon(MACROBLOCK *x)
-{
- int i;
- int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
- && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
- for (i = 0; i < 24; i+=2)
- x->quantize_b_pair(&x->block[i], &x->block[i+1],
- &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-
- if (has_2nd_order)
- x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-
-void vp8_quantize_mbuv_neon(MACROBLOCK *x)
-{
- int i;
-
- for (i = 16; i < 24; i+=2)
- x->quantize_b_pair(&x->block[i], &x->block[i+1],
- &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-}
-
-#endif /* HAVE_NEON */
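
With the pair variants gone (the block.h and encodeframe.c hunks below also drop the quantize_b_pair pointer), blocks are presumably quantized one at a time through the remaining quantize_b pointer, as the generic helpers in vp8/encoder/quantize.c do. A hedged sketch of that shape, mirroring the loop bounds of the deleted functions above:

    /* Sketch only; the shipped helpers live in vp8/encoder/quantize.c. */
    void quantize_mby_sketch(MACROBLOCK *x) {
      int i;
      const int has_2nd_order =
          (x->e_mbd.mode_info_context->mbmi.mode != B_PRED &&
           x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);

      for (i = 0; i < 16; i++)               /* 16 luma blocks */
        x->quantize_b(&x->block[i], &x->e_mbd.block[i]);

      if (has_2nd_order)                     /* Y2 (second order) block */
        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
    }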
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c
index 9d0e69cf44c..ea279b32181 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c
@@ -159,7 +159,7 @@ static void write_split(vp8_writer *bc, int x)
);
}
-void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount)
{
const TOKENEXTRA *stop = p + xcount;
unsigned int split;
@@ -374,7 +374,7 @@ static void write_partition_size(unsigned char *cx_data, int size)
}
-static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
+static void pack_tokens_into_partitions(VP8_COMP *cpi, unsigned char *cx_data,
unsigned char * cx_data_end,
int num_part)
{
@@ -398,7 +398,7 @@ static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
int tokens = (int)(stop - p);
- vp8_pack_tokens_c(w, p, tokens);
+ vp8_pack_tokens(w, p, tokens);
}
vp8_stop_encode(w);
@@ -407,7 +407,7 @@ static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
}
-static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
+static void pack_mb_row_tokens(VP8_COMP *cpi, vp8_writer *w)
{
int mb_row;
@@ -417,7 +417,7 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
int tokens = (int)(stop - p);
- vp8_pack_tokens_c(w, p, tokens);
+ vp8_pack_tokens(w, p, tokens);
}
}
@@ -1543,7 +1543,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
if (pc->refresh_entropy_probs == 0)
{
/* save a copy for later refresh */
- vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+ memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
}
vp8_update_coef_probs(cpi);
@@ -1620,7 +1620,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
/* concatenate partition buffers */
for(i = 0; i < num_part; i++)
{
- vpx_memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
+ memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
cpi->partition_d[i+1] = dp;
dp += cpi->partition_sz[i+1];
}
@@ -1676,7 +1676,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
pack_mb_row_tokens(cpi, &cpi->bc[1]);
else
#endif
- pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
+ vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
vp8_stop_encode(&cpi->bc[1]);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h
index 66f4bf67ee4..de69805513c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h
@@ -16,36 +16,7 @@
extern "C" {
#endif
-#if HAVE_EDSP
-void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
- vp8_token *,
- const vp8_extra_bit_struct *,
- const vp8_tree_index *);
-void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
- unsigned char * cx_data,
- const unsigned char *cx_data_end,
- int num_parts,
- vp8_token *,
- const vp8_extra_bit_struct *,
- const vp8_tree_index *);
-void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
- vp8_token *,
- const vp8_extra_bit_struct *,
- const vp8_tree_index *);
-# define pack_tokens(a,b,c) \
- vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-# define pack_tokens_into_partitions(a,b,c,d) \
- vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-# define pack_mb_row_tokens(a,b) \
- vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-#else
-
-void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount);
-
-# define pack_tokens(a,b,c) vp8_pack_tokens_c(a,b,c)
-# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d)
-# define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b)
-#endif
+void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h
index 1f212cae8be..248e79549bf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h
@@ -125,6 +125,8 @@ typedef struct macroblock
int optimize;
int q_index;
+ int is_skin;
+ int denoise_zeromv;
#if CONFIG_TEMPORAL_DENOISING
int increase_denoising;
@@ -160,8 +162,9 @@ typedef struct macroblock
void (*short_fdct8x4)(short *input, short *output, int pitch);
void (*short_walsh4x4)(short *input, short *output, int pitch);
void (*quantize_b)(BLOCK *b, BLOCKD *d);
- void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
+ unsigned int mbs_zero_last_dot_suppress;
+ int zero_last_dot_suppress;
} MACROBLOCK;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c
index 75b2a3be456..d197f8f8166 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c
@@ -374,7 +374,7 @@ void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
} else if (mode == 3) {
denoiser->denoiser_mode = kDenoiserOnYUVAggressive;
} else {
- denoiser->denoiser_mode = kDenoiserOnAdaptive;
+ denoiser->denoiser_mode = kDenoiserOnYUV;
}
if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) {
denoiser->denoise_pars.scale_sse_thresh = 1;
@@ -391,9 +391,9 @@ void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
denoiser->denoise_pars.scale_increase_filter = 1;
denoiser->denoise_pars.denoise_mv_bias = 60;
denoiser->denoise_pars.pickmode_mv_bias = 75;
- denoiser->denoise_pars.qp_thresh = 85;
+ denoiser->denoise_pars.qp_thresh = 80;
denoiser->denoise_pars.consec_zerolast = 15;
- denoiser->denoise_pars.spatial_blur = 20;
+ denoiser->denoise_pars.spatial_blur = 0;
}
}
@@ -415,8 +415,8 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
vp8_denoiser_free(denoiser);
return 1;
}
- vpx_memset(denoiser->yv12_running_avg[i].buffer_alloc, 0,
- denoiser->yv12_running_avg[i].frame_size);
+ memset(denoiser->yv12_running_avg[i].buffer_alloc, 0,
+ denoiser->yv12_running_avg[i].frame_size);
}
denoiser->yv12_mc_running_avg.flags = 0;
@@ -428,19 +428,19 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
return 1;
}
- vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
- denoiser->yv12_mc_running_avg.frame_size);
+ memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
+ denoiser->yv12_mc_running_avg.frame_size);
if (vp8_yv12_alloc_frame_buffer(&denoiser->yv12_last_source, width,
height, VP8BORDERINPIXELS) < 0) {
vp8_denoiser_free(denoiser);
return 1;
}
- vpx_memset(denoiser->yv12_last_source.buffer_alloc, 0,
- denoiser->yv12_last_source.frame_size);
+ memset(denoiser->yv12_last_source.buffer_alloc, 0,
+ denoiser->yv12_last_source.frame_size);
denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
- vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
+ memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
vp8_denoiser_set_parameters(denoiser, mode);
denoiser->nmse_source_diff = 0;
denoiser->nmse_source_diff_count = 0;
@@ -453,16 +453,16 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
// Bitrate thresholds and noise metric (nmse) thresholds for switching to
// aggressive mode.
// TODO(marpan): Adjust thresholds, including effect on resolution.
- denoiser->bitrate_threshold = 300000; // (bits/sec).
- denoiser->threshold_aggressive_mode = 35;
+ denoiser->bitrate_threshold = 400000; // (bits/sec).
+ denoiser->threshold_aggressive_mode = 80;
if (width * height > 1280 * 720) {
- denoiser->bitrate_threshold = 2000000;
- denoiser->threshold_aggressive_mode = 1400;
+ denoiser->bitrate_threshold = 3000000;
+ denoiser->threshold_aggressive_mode = 200;
} else if (width * height > 960 * 540) {
- denoiser->bitrate_threshold = 800000;
- denoiser->threshold_aggressive_mode = 150;
+ denoiser->bitrate_threshold = 1200000;
+ denoiser->threshold_aggressive_mode = 120;
} else if (width * height > 640 * 480) {
- denoiser->bitrate_threshold = 500000;
+ denoiser->bitrate_threshold = 600000;
denoiser->threshold_aggressive_mode = 100;
}
return 0;
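
The retuned gates above, summarized by frame size (how these fields drive the switch to aggressive mode is not part of this hunk):

    /*   frame size     bitrate_threshold (bps)   threshold_aggressive_mode
     *   <= 640x480     400000 (default)          80 (default)
     *   >  640x480     600000                    100
     *   >  960x540     1200000                   120
     *   >  1280x720    3000000                   200
     */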
@@ -483,7 +483,6 @@ void vp8_denoiser_free(VP8_DENOISER *denoiser)
vpx_free(denoiser->denoise_state);
}
-
void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
MACROBLOCK *x,
unsigned int best_sse,
@@ -554,6 +553,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
* Note that any changes to the mode info only affect the
* denoising.
*/
+ x->denoise_zeromv = 1;
mbmi->ref_frame =
x->best_zeromv_reference_frame;
@@ -603,6 +603,12 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
NOISE_MOTION_THRESHOLD;
+ // If the block is considered to be a skin area, lower the motion
+ // threshold. In the current version the threshold is set to 1, so only
+ // very low (i.e., zero) mv on skin is denoised.
+ if (x->is_skin)
+ motion_threshold = 1;
+
if (motion_magnitude2 <
denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
x->increase_denoising = 1;
@@ -662,6 +668,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
/* No filtering of this block; it differs too much from the predictor,
* or the motion vector magnitude is considered too big.
*/
+ x->denoise_zeromv = 0;
vp8_copy_mem16x16(
x->thismb, 16,
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
@@ -692,7 +699,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
int uv_stride =denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
// Fix filter level to some nominal value for now.
- int filter_level = 32;
+ int filter_level = 48;
int hev_index = lfi_n->hev_thr_lut[INTER_FRAME][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h
index 6c1f9e22baa..9a379a6a168 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h
@@ -19,7 +19,7 @@ extern "C" {
#endif
#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
-#define SUM_DIFF_THRESHOLD_HIGH (600)
+#define SUM_DIFF_THRESHOLD_HIGH (600) // ~(16 * 16 * 2.3)
#define MOTION_MAGNITUDE_THRESHOLD (8*3)
#define SUM_DIFF_THRESHOLD_UV (96) // (8 * 8 * 1.5)
@@ -27,7 +27,7 @@ extern "C" {
#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8)
#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
-#define MAX_GF_ARF_DENOISE_RANGE (16)
+#define MAX_GF_ARF_DENOISE_RANGE (8)
enum vp8_denoiser_decision
{
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c
index aec6b9880c1..378e902c6a4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c
@@ -82,6 +82,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
{
unsigned int act;
unsigned int sse;
+ (void)cpi;
/* TODO: This could also be done over smaller areas (8x8), but that would
* require extensive changes elsewhere, as lambda is assumed to be fixed
* over an entire MB in most of the code.
@@ -154,8 +155,8 @@ static void calc_av_activity( VP8_COMP *cpi, int64_t activity_sum )
cpi->common.MBs));
/* Copy map to sort list */
- vpx_memcpy( sortlist, cpi->mb_activity_map,
- sizeof(unsigned int) * cpi->common.MBs );
+ memcpy( sortlist, cpi->mb_activity_map,
+ sizeof(unsigned int) * cpi->common.MBs );
/* Ripple each value down to its correct position */
@@ -522,7 +523,8 @@ void encode_mb_row(VP8_COMP *cpi,
}
#endif
- // Keep track of how many (consecutive) times a block is coded
+
+ // Keep track of how many (consecutive) times a block is coded
// as ZEROMV_LASTREF, for base layer frames.
// Reset to 0 if it's coded as anything else.
if (cpi->current_layer == 0) {
@@ -531,9 +533,14 @@ void encode_mb_row(VP8_COMP *cpi,
// Increment, check for wrap-around.
if (cpi->consec_zero_last[map_index+mb_col] < 255)
cpi->consec_zero_last[map_index+mb_col] += 1;
+ if (cpi->consec_zero_last_mvbias[map_index+mb_col] < 255)
+ cpi->consec_zero_last_mvbias[map_index+mb_col] += 1;
} else {
cpi->consec_zero_last[map_index+mb_col] = 0;
+ cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
}
+ if (x->zero_last_dot_suppress)
+ cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
}
/* Special case code for cyclic refresh
@@ -574,7 +581,7 @@ void encode_mb_row(VP8_COMP *cpi,
/* pack tokens for this MB */
{
int tok_count = *tp - tp_start;
- pack_tokens(w, tp_start, tok_count);
+ vp8_pack_tokens(w, tp_start, tok_count);
}
#endif
/* Increment pointer into gf usage flags structure. */
@@ -658,8 +665,7 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi)
x->mvc = cm->fc.mvc;
- vpx_memset(cm->above_context, 0,
- sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+ memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
/* Special case treatment when GF and ARF are not sensible options
* for reference
@@ -737,7 +743,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
const int num_part = (1 << cm->multi_token_partition);
#endif
- vpx_memset(segment_counts, 0, sizeof(segment_counts));
+ memset(segment_counts, 0, sizeof(segment_counts));
totalrate = 0;
if (cpi->compressor_speed == 2)
@@ -967,7 +973,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
int i;
/* Set to defaults */
- vpx_memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
+ memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
@@ -1143,6 +1149,8 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
while (++b < 16);
}
+#else
+ (void)cpi;
#endif
++x->ymode_count[m];
@@ -1252,7 +1260,6 @@ int vp8cx_encode_inter_macroblock
if(cpi->sf.use_fastquant_for_pick)
{
x->quantize_b = vp8_fast_quantize_b;
- x->quantize_b_pair = vp8_fast_quantize_b_pair;
/* the fast quantizer does not use zbin_extra, so
* do not recalculate */
@@ -1265,7 +1272,6 @@ int vp8cx_encode_inter_macroblock
if (cpi->sf.improved_quant)
{
x->quantize_b = vp8_regular_quantize_b;
- x->quantize_b_pair = vp8_regular_quantize_b_pair;
}
/* restore cpi->zbin_mode_boost_enabled */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.c
index eb0619d9597..dfd0a237a5f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.c
@@ -506,8 +506,8 @@ static void optimize_mb(MACROBLOCK *x)
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -555,8 +555,8 @@ void vp8_optimize_mby(MACROBLOCK *x)
if (!x->e_mbd.left_context)
return;
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -595,8 +595,8 @@ void vp8_optimize_mbuv(MACROBLOCK *x)
if (!x->e_mbd.left_context)
return;
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c
index 7b8b51f308b..977b0b0321e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c
@@ -215,11 +215,15 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
LAST_FRAME) {
// Increment, check for wrap-around.
if (cpi->consec_zero_last[map_index+mb_col] < 255)
- cpi->consec_zero_last[map_index+mb_col] +=
- 1;
+ cpi->consec_zero_last[map_index+mb_col] += 1;
+ if (cpi->consec_zero_last_mvbias[map_index+mb_col] < 255)
+ cpi->consec_zero_last_mvbias[map_index+mb_col] += 1;
} else {
cpi->consec_zero_last[map_index+mb_col] = 0;
+ cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
}
+ if (x->zero_last_dot_suppress)
+ cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
}
/* Special case code for cyclic refresh
@@ -261,7 +265,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
/* pack tokens for this MB */
{
int tok_count = tp - tp_start;
- pack_tokens(w, tp_start, tok_count);
+ vp8_pack_tokens(w, tp_start, tok_count);
}
#else
cpi->tplist[mb_row].stop = tp;
@@ -346,7 +350,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->short_fdct8x4 = x->short_fdct8x4;
z->short_walsh4x4 = x->short_walsh4x4;
z->quantize_b = x->quantize_b;
- z->quantize_b_pair = x->quantize_b_pair;
z->optimize = x->optimize;
/*
@@ -413,14 +416,13 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
zd->subpixel_predict16x16 = xd->subpixel_predict16x16;
zd->segmentation_enabled = xd->segmentation_enabled;
zd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
- vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data,
- sizeof(xd->segment_feature_data));
+ memcpy(zd->segment_feature_data, xd->segment_feature_data,
+ sizeof(xd->segment_feature_data));
- vpx_memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc,
- sizeof(xd->dequant_y1_dc));
- vpx_memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
- vpx_memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
- vpx_memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+ memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+ memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+ memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+ memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
#if 1
/*TODO: Remove dequant from BLOCKD. This is a temporary solution until
@@ -435,15 +437,14 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
#endif
- vpx_memcpy(z->rd_threshes, x->rd_threshes, sizeof(x->rd_threshes));
- vpx_memcpy(z->rd_thresh_mult, x->rd_thresh_mult,
- sizeof(x->rd_thresh_mult));
+ memcpy(z->rd_threshes, x->rd_threshes, sizeof(x->rd_threshes));
+ memcpy(z->rd_thresh_mult, x->rd_thresh_mult, sizeof(x->rd_thresh_mult));
z->zbin_over_quant = x->zbin_over_quant;
z->zbin_mode_boost_enabled = x->zbin_mode_boost_enabled;
z->zbin_mode_boost = x->zbin_mode_boost;
- vpx_memset(z->error_bins, 0, sizeof(z->error_bins));
+ memset(z->error_bins, 0, sizeof(z->error_bins));
}
}
@@ -469,7 +470,7 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
mb->gf_active_ptr = x->gf_active_ptr;
- vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
+ memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
mbr_ei[i].totalrate = 0;
mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1);
@@ -506,6 +507,7 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
mb->intra_error = 0;
vp8_zero(mb->count_mb_ref_frame_usage);
mb->mbs_tested_so_far = 0;
+ mb->mbs_zero_last_dot_suppress = 0;
}
}
@@ -543,7 +545,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi)
vpx_malloc(sizeof(sem_t) * th_count));
CHECK_MEM_ERROR(cpi->mb_row_ei,
vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
- vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
+ memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
CHECK_MEM_ERROR(cpi->en_thread_data,
vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c
index 98e5a7115db..75c1362610f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c
@@ -132,6 +132,7 @@ static void output_stats(const VP8_COMP *cpi,
FIRSTPASS_STATS *stats)
{
struct vpx_codec_cx_pkt pkt;
+ (void)cpi;
pkt.kind = VPX_CODEC_STATS_PKT;
pkt.data.twopass_stats.buf = stats;
pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
@@ -418,6 +419,7 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
int raw_stride = raw_buffer->y_stride;
unsigned char *ref_ptr;
int ref_stride = x->e_mbd.pre.y_stride;
+ (void)cpi;
/* Set up pointers for this macro block raw buffer */
raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
@@ -571,7 +573,7 @@ void vp8_first_pass(VP8_COMP *cpi)
{
int flag[2] = {1, 1};
vp8_initialize_rd_consts(cpi, x, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
- vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
}
@@ -1409,6 +1411,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
void vp8_end_second_pass(VP8_COMP *cpi)
{
+ (void)cpi;
}
/* This function gives an estimate of how badly we believe the prediction
@@ -1419,6 +1422,7 @@ static double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_fra
double prediction_decay_rate;
double motion_decay;
double motion_pct = next_frame->pcnt_motion;
+ (void)cpi;
/* Initial basis is the % mbs inter coded */
prediction_decay_rate = next_frame->pcnt_inter;
@@ -1547,6 +1551,7 @@ static void accumulate_frame_motion_stats(
double this_frame_mvr_ratio;
double this_frame_mvc_ratio;
double motion_pct;
+ (void)cpi;
/* Accumulate motion stats. */
motion_pct = this_frame->pcnt_motion;
@@ -1774,7 +1779,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
start_pos = cpi->twopass.stats_in;
- vpx_memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */
+ memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */
/* Load stats for the current frame. */
mod_frame_err = calculate_modified_err(cpi, this_frame);
@@ -1870,7 +1875,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
break;
}
- vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+ memcpy(this_frame, &next_frame, sizeof(*this_frame));
old_boost_score = boost_score;
}
@@ -2440,7 +2445,7 @@ void vp8_second_pass(VP8_COMP *cpi)
if (cpi->twopass.frames_to_key == 0)
{
/* Define next KF group and assign bits to it */
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
find_next_key_frame(cpi, &this_frame_copy);
/* Special case: error_resilient_mode does not make much
@@ -2466,7 +2471,7 @@ void vp8_second_pass(VP8_COMP *cpi)
if (cpi->frames_till_gf_update_due == 0)
{
/* Define next gf group and assign bits to it */
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
define_gf_group(cpi, &this_frame_copy);
/* If we are going to code an altref frame at the end of the group
@@ -2482,7 +2487,7 @@ void vp8_second_pass(VP8_COMP *cpi)
* to the GF group
*/
int bak = cpi->per_frame_bandwidth;
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
assign_std_frame_bits(cpi, &this_frame_copy);
cpi->per_frame_bandwidth = bak;
}
@@ -2505,14 +2510,14 @@ void vp8_second_pass(VP8_COMP *cpi)
if (cpi->common.frame_type != KEY_FRAME)
{
/* Assign bits from those allocated to the GF group */
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
assign_std_frame_bits(cpi, &this_frame_copy);
}
}
else
{
/* Assign bits from those allocated to the GF group */
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
assign_std_frame_bits(cpi, &this_frame_copy);
}
}
@@ -2653,7 +2658,7 @@ static int test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTP
double decay_accumulator = 1.0;
double next_iiratio;
- vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+ memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
/* Note the starting file position so we can reset to it */
start_pos = cpi->twopass.stats_in;
@@ -2730,7 +2735,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double kf_group_coded_err = 0.0;
double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
- vpx_memset(&next_frame, 0, sizeof(next_frame));
+ memset(&next_frame, 0, sizeof(next_frame));
vp8_clear_system_state();
start_position = cpi->twopass.stats_in;
@@ -2751,7 +2756,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
cpi->twopass.frames_to_key = 1;
/* Take a copy of the initial frame details */
- vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+ memcpy(&first_frame, this_frame, sizeof(*this_frame));
cpi->twopass.kf_group_bits = 0;
cpi->twopass.kf_group_error_left = 0;
@@ -2774,7 +2779,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
kf_group_coded_err += this_frame->coded_error;
/* Load the next frame's stats. */
- vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+ memcpy(&last_frame, this_frame, sizeof(*this_frame));
input_stats(cpi, this_frame);
/* Provided that we are not at the end of the file... */
@@ -2842,7 +2847,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
cpi->twopass.frames_to_key /= 2;
/* Copy first frame details */
- vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+ memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
/* Reset to the start of the group */
reset_fpf_position(cpi, start_position);
@@ -2964,7 +2969,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
*/
decay_accumulator = 1.0;
boost_score = 0.0;
- loop_decay_rate = 1.00; /* Starting decay rate */
for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
{
@@ -3208,7 +3212,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
int new_width = cpi->oxcf.Width;
int new_height = cpi->oxcf.Height;
- int projected_buffer_level = (int)cpi->buffer_level;
+ int projected_buffer_level;
int tmp_q;
double projected_bits_perframe;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c
index 545f2c89321..f848e8fb571 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c
@@ -9,6 +9,8 @@
*/
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "onyx_int.h"
#include "mcomp.h"
#include "vpx_mem/vpx_mem.h"
@@ -888,6 +890,8 @@ int vp8_hex_search
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ (void)mvcost;
+
/* adjust ref_mv to make sure it is within MV range */
vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
br = ref_mv->as_mv.row;
@@ -898,7 +902,7 @@ int vp8_hex_search
this_offset = base_offset + (br * (pre_stride)) + bc;
this_mv.as_mv.row = br;
this_mv.as_mv.col = bc;
- bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, UINT_MAX)
+ bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride)
+ mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
#if CONFIG_MULTI_RES_ENCODING
@@ -911,6 +915,8 @@ int vp8_hex_search
else if (search_param >= 1) hex_range = 63;
dia_range = 8;
+#else
+ (void)search_param;
#endif
/* hex search */
@@ -923,7 +929,7 @@ int vp8_hex_search
this_mv.as_mv.row = br + hex[i].row;
this_mv.as_mv.col = bc + hex[i].col;
this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}else
@@ -934,7 +940,7 @@ int vp8_hex_search
this_mv.as_mv.col = bc + hex[i].col;
CHECK_POINT
this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}
@@ -960,7 +966,7 @@ int vp8_hex_search
this_mv.as_mv.row = br + next_chkpts[k][i].row;
this_mv.as_mv.col = bc + next_chkpts[k][i].col;
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}else
@@ -971,7 +977,7 @@ int vp8_hex_search
this_mv.as_mv.col = bc + next_chkpts[k][i].col;
CHECK_POINT
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}
@@ -1002,7 +1008,7 @@ cal_neighbors:
this_mv.as_mv.row = br + neighbors[i].row;
this_mv.as_mv.col = bc + neighbors[i].col;
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}else
@@ -1013,7 +1019,7 @@ cal_neighbors:
this_mv.as_mv.col = bc + neighbors[i].col;
CHECK_POINT
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}
@@ -1097,7 +1103,7 @@ int vp8_diamond_search_sad_c
best_address = in_what;
/* Check the starting position */
- bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* search_param determines the length of the initial step and hence
@@ -1122,7 +1128,7 @@ int vp8_diamond_search_sad_c
{
check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad)
{
@@ -1221,7 +1227,7 @@ int vp8_diamond_search_sadx4
best_address = in_what;
/* Check the starting position */
- bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* search_param determines the length of the initial step and hence the
@@ -1289,7 +1295,7 @@ int vp8_diamond_search_sadx4
(this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
{
check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad)
{
@@ -1372,8 +1378,7 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
best_mv->as_mv.col = ref_col;
/* Baseline value at the centre */
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
- in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* Apply further limits to prevent us from searching using vectors that
@@ -1398,7 +1403,7 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
for (c = col_min; c < col_max; c++)
{
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
this_mv.as_mv.col = c;
thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
@@ -1470,8 +1475,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
best_mv->as_mv.col = ref_col;
/* Baseline value at the centre */
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
- in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* Apply further limits to prevent us from searching using vectors that stretch
@@ -1527,7 +1531,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
while (c < col_max)
{
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad)
{
@@ -1586,7 +1590,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
int col_min = ref_col - distance;
int col_max = ref_col + distance;
- DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
+ // TODO(johannkoenig): check if this alignment is necessary.
+ DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
unsigned int sad_array[3];
int *mvsadcost[2];
@@ -1605,8 +1610,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
best_mv->as_mv.col = ref_col;
/* Baseline value at the centre */
- bestsad = fn_ptr->sdf(what, what_stride,
- bestaddress, in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* Apply further limits to prevent us from searching using vectors that stretch
@@ -1692,7 +1696,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
while (c < col_max)
{
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
if (thissad < bestsad)
{
@@ -1750,8 +1754,7 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
- bestsad = fn_ptr->sdf(what, what_stride, best_address,
- in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride)
+ mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
for (i=0; i<search_range; i++)
@@ -1767,7 +1770,7 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv
(this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
{
check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
if (thissad < bestsad)
{
@@ -1830,8 +1833,7 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
- bestsad = fn_ptr->sdf(what, what_stride, best_address,
- in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride)
+ mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
for (i=0; i<search_range; i++)
@@ -1882,7 +1884,7 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
(this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
{
check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
if (thissad < bestsad)
{
@@ -1974,8 +1976,8 @@ void print_mode_context(void)
#ifdef VP8_ENTROPY_STATS
void init_mv_ref_counts()
{
- vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
- vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+ memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+ memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
}
void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
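
Every vfp->sdf call in the hunks above loses its trailing best-SAD bound: the SAD kernels now come from vpx_dsp (see the fn_ptr changes in onyx_if.c below) and no longer support early termination. A minimal scalar sketch of the new four-argument shape, assuming the vpx_sad16x16 signature; the shipped versions are heavily optimized:

    #include <stdlib.h>

    /* Sketch of a vpx_dsp-style SAD kernel: plain sum of absolute
     * differences over a 16x16 block, with no early-out parameter. */
    unsigned int sad16x16_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < 16; ++r) {
        for (c = 0; c < 16; ++c)
          sad += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }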
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
index d8eff669ed5..5b452312ed2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
@@ -11,6 +11,8 @@
#include "vpx_config.h"
#include "./vpx_scale_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vp8_rtcd.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/blockd.h"
#include "onyx_int.h"
@@ -427,10 +429,10 @@ static void setup_features(VP8_COMP *cpi)
cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0;
cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
- vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
- vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
- vpx_memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
- vpx_memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+ memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+ memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
set_default_lf_deltas(cpi);
@@ -507,7 +509,7 @@ static void disable_segmentation(VP8_COMP *cpi)
static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map)
{
/* Copy in the new segmentation map */
- vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
+ memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
/* Signal that the map should be updated. */
cpi->mb.e_mbd.update_mb_segmentation_map = 1;
@@ -529,7 +531,7 @@ static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map)
static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta)
{
cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
- vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
+ memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
}
@@ -579,11 +581,31 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
cpi->cyclic_refresh_q = Q / 2;
+ if (cpi->oxcf.screen_content_mode) {
+ // Modify quality ramp-up based on Q. Above some Q level, increase the
+ // number of blocks to be refreshed, and reduce it below the threshold.
+ // Turn it off under certain conditions (i.e., away from a key frame, and
+ // if we are at good quality (low Q) and most of the blocks were
+ // skip-encoded in the previous frame).
+ if (Q >= 100) {
+ cpi->cyclic_refresh_mode_max_mbs_perframe =
+ (cpi->common.mb_rows * cpi->common.mb_cols) / 10;
+ } else if (cpi->frames_since_key > 250 &&
+ Q < 20 &&
+ cpi->mb.skip_true_count > (int)(0.95 * mbs_in_frame)) {
+ cpi->cyclic_refresh_mode_max_mbs_perframe = 0;
+ } else {
+ cpi->cyclic_refresh_mode_max_mbs_perframe =
+ (cpi->common.mb_rows * cpi->common.mb_cols) / 20;
+ }
+ block_count = cpi->cyclic_refresh_mode_max_mbs_perframe;
+ }
+
// Set every macroblock to be eligible for update.
// For key frame this will reset seg map to 0.
- vpx_memset(cpi->segmentation_map, 0, mbs_in_frame);
+ memset(cpi->segmentation_map, 0, mbs_in_frame);
- if (cpi->common.frame_type != KEY_FRAME)
+ if (cpi->common.frame_type != KEY_FRAME && block_count > 0)
{
/* Cycle through the macro_block rows */
/* MB loop to set local segmentation map */
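
The screen-content branch added above picks a per-frame refresh budget from Q and recent skip statistics; in table form (mbs = mb_rows * mb_cols, checked in this order):

    /* Q >= 100                                           -> mbs / 10 (ramp up)
     * frames_since_key > 250 && Q < 20 && ~95% skipped   -> 0        (off)
     * otherwise                                          -> mbs / 20
     */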
@@ -617,15 +639,18 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0) {
if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
- Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
+ Q < (int)cpi->denoiser.denoise_pars.qp_thresh &&
+ (cpi->frames_since_key >
+ 2 * cpi->denoiser.denoise_pars.consec_zerolast)) {
// Under aggressive denoising, use segmentation to adjust the loop
- // filter below some qp thresh. The filter is turned off for all
+ // filter below some qp thresh. The filter is reduced for all
// blocks that have been encoded as ZEROMV LAST x frames in a row,
// where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
// This is to avoid "dot" artifacts that can occur from repeated
// loop filtering on noisy input source.
cpi->cyclic_refresh_q = Q;
- lf_adjustment = -MAX_LOOP_FILTER;
+ // lf_adjustment = -MAX_LOOP_FILTER;
+ lf_adjustment = -40;
for (i = 0; i < mbs_in_frame; ++i) {
seg_map[i] = (cpi->consec_zero_last[i] >
cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
@@ -662,8 +687,8 @@ static void set_default_lf_deltas(VP8_COMP *cpi)
cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
- vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
- vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+ memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
/* Test of ref frame deltas */
cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
@@ -786,6 +811,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
}
cpi->mb.mbs_tested_so_far = 0;
+ cpi->mb.mbs_zero_last_dot_suppress = 0;
/* best quality defaults */
sf->RD = 1;
@@ -853,6 +879,25 @@ void vp8_set_speed_features(VP8_COMP *cpi)
sf->thresh_mult[THR_SPLIT2] =
sf->thresh_mult[THR_SPLIT3] = speed_map(Speed, thresh_mult_map_split2);
+ // Special case for temporal layers.
+ // Reduce the thresholds for zero/nearest/near for GOLDEN, if GOLDEN is
+ // used as the second reference. We don't modify thresholds for the ALTREF
+ // case since ALTREF is usually used as a long-term reference in temporal
+ // layers.
+ if ((cpi->Speed <= 6) &&
+ (cpi->oxcf.number_of_layers > 1) &&
+ (cpi->ref_frame_flags & VP8_LAST_FRAME) &&
+ (cpi->ref_frame_flags & VP8_GOLD_FRAME)) {
+ if (cpi->closest_reference_frame == GOLDEN_FRAME) {
+ sf->thresh_mult[THR_ZERO2] = sf->thresh_mult[THR_ZERO2] >> 3;
+ sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST2] >> 3;
+ sf->thresh_mult[THR_NEAR2] = sf->thresh_mult[THR_NEAR2] >> 3;
+ } else {
+ sf->thresh_mult[THR_ZERO2] = sf->thresh_mult[THR_ZERO2] >> 1;
+ sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST2] >> 1;
+ sf->thresh_mult[THR_NEAR2] = sf->thresh_mult[THR_NEAR2] >> 1;
+ }
+ }
+
cpi->mode_check_freq[THR_ZERO1] =
cpi->mode_check_freq[THR_NEAREST1] =
cpi->mode_check_freq[THR_NEAR1] =
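
A worked example of the shifts added above, using a hypothetical base value (the real thresh_mult defaults are set elsewhere in vp8_set_speed_features):

    /* Hypothetical numbers, for illustration only:
     *   sf->thresh_mult[THR_ZERO2] = 4000;
     *   closest_reference_frame == GOLDEN_FRAME -> 4000 >> 3 == 500
     *   otherwise                               -> 4000 >> 1 == 2000
     * GOLDEN-predicted modes are evaluated far more eagerly when GOLDEN
     * is the nearer second reference in the temporal pattern. */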
@@ -1043,7 +1088,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (Speed >= 15)
sf->half_pixel_search = 0;
- vpx_memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
+ memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
}; /* switch */
@@ -1083,12 +1128,10 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (cpi->sf.improved_quant)
{
cpi->mb.quantize_b = vp8_regular_quantize_b;
- cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
}
else
{
cpi->mb.quantize_b = vp8_fast_quantize_b;
- cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;
}
if (cpi->sf.improved_quant != last_improved_quant)
vp8cx_init_quantizer(cpi);
@@ -1256,7 +1299,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
CHECK_MEM_ERROR(cpi->active_map,
vpx_calloc(cm->mb_rows * cm->mb_cols,
sizeof(*cpi->active_map)));
- vpx_memset(cpi->active_map , 1, (cm->mb_rows * cm->mb_cols));
+ memset(cpi->active_map , 1, (cm->mb_rows * cm->mb_cols));
#if CONFIG_MULTITHREAD
if (width < 640)
@@ -1363,20 +1406,31 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cm->version = oxcf->Version;
vp8_setup_version(cm);
- /* frame rate is not available on the first frame, as it's derived from
+ /* Frame rate is not available on the first frame, as it's derived from
* the observed timestamps. The actual value used here doesn't matter
- * too much, as it will adapt quickly. If the reciprocal of the timebase
- * seems like a reasonable framerate, then use that as a guess, otherwise
- * use 30.
+ * too much, as it will adapt quickly.
*/
- cpi->framerate = (double)(oxcf->timebase.den) /
- (double)(oxcf->timebase.num);
+ if (oxcf->timebase.num > 0) {
+ cpi->framerate = (double)(oxcf->timebase.den) /
+ (double)(oxcf->timebase.num);
+ } else {
+ cpi->framerate = 30;
+ }
+ /* If the reciprocal of the timebase seems like a reasonable framerate,
+ * then use that as a guess, otherwise use 30.
+ */
if (cpi->framerate > 180)
cpi->framerate = 30;
cpi->ref_framerate = cpi->framerate;
+ cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
+
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->refresh_entropy_probs = 1;
+
/* change includes all joint functionality */
vp8_change_config(cpi, oxcf);
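
The init_config hunk above adds a divide-by-zero guard and keeps the old plausibility check; isolated, the new logic looks like this (a sketch, not the shipped helper):

    /* Derive the initial framerate guess from the configured timebase,
     * falling back to 30 when the timebase is unusable or implausible. */
    static double initial_framerate_guess(int num, int den) {
      double fps = (num > 0) ? (double)den / (double)num : 30.0;
      if (fps > 180)   /* reciprocal is not a reasonable framerate */
        fps = 30.0;
      return fps;
    }
    /* e.g. {num = 1, den = 30} -> 30.0; {num = 0, den = 1000} -> 30.0 */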
@@ -1597,12 +1651,6 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cpi->baseline_gf_interval =
cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
- cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
-
- cm->refresh_golden_frame = 0;
- cm->refresh_last_frame = 1;
- cm->refresh_entropy_probs = 1;
-
#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
cpi->oxcf.token_partitions = 3;
#endif
@@ -1705,13 +1753,25 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
if (cpi->oxcf.number_of_layers != prev_number_of_layers)
{
// If the number of temporal layers is changed, we must start at the
- // base of the pattern cycle, so reset temporal_pattern_counter.
+ // base of the pattern cycle, so set the layer id to 0 and reset
+ // the temporal pattern counter.
+ if (cpi->temporal_layer_id > 0) {
+ cpi->temporal_layer_id = 0;
+ }
cpi->temporal_pattern_counter = 0;
reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
}
+ if (!cpi->initial_width)
+ {
+ cpi->initial_width = cpi->oxcf.Width;
+ cpi->initial_height = cpi->oxcf.Height;
+ }
+
cm->Width = cpi->oxcf.Width;
cm->Height = cpi->oxcf.Height;
+ assert(cm->Width <= cpi->initial_width);
+ assert(cm->Height <= cpi->initial_height);
/* TODO(jkoleszar): if an internal spatial resampling is active,
* and we downsize the input image, maybe we should clear the
@@ -1832,7 +1892,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cm = &cpi->common;
- vpx_memset(cpi, 0, sizeof(VP8_COMP));
+ memset(cpi, 0, sizeof(VP8_COMP));
if (setjmp(cm->error.jmp))
{
@@ -1852,6 +1912,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
cpi->common.current_video_frame = 0;
cpi->temporal_pattern_counter = 0;
+ cpi->temporal_layer_id = -1;
cpi->kf_overspend_bits = 0;
cpi->kf_bitrate_adjustment = 0;
cpi->frames_till_gf_update_due = 0;
@@ -1904,6 +1965,8 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
}
#endif
+ cpi->mse_source_denoised = 0;
+
/* Should we use the cyclic refresh method.
* Currently this is tied to error resilient mode
*/
@@ -1927,7 +1990,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->cyclic_refresh_map = (signed char *) NULL;
CHECK_MEM_ERROR(cpi->consec_zero_last,
- vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+ vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
+ CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
+ vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
#ifdef VP8_ENTROPY_STATS
init_context_counters();
@@ -2062,55 +2127,55 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
}
#endif
- cpi->fn_ptr[BLOCK_16X16].sdf = vp8_sad16x16;
+ cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
- cpi->fn_ptr[BLOCK_16X16].sdx3f = vp8_sad16x16x3;
- cpi->fn_ptr[BLOCK_16X16].sdx8f = vp8_sad16x16x8;
- cpi->fn_ptr[BLOCK_16X16].sdx4df = vp8_sad16x16x4d;
+ cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
+ cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
+ cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
- cpi->fn_ptr[BLOCK_16X8].sdf = vp8_sad16x8;
+ cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
- cpi->fn_ptr[BLOCK_16X8].sdx3f = vp8_sad16x8x3;
- cpi->fn_ptr[BLOCK_16X8].sdx8f = vp8_sad16x8x8;
- cpi->fn_ptr[BLOCK_16X8].sdx4df = vp8_sad16x8x4d;
+ cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3;
+ cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8;
+ cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
- cpi->fn_ptr[BLOCK_8X16].sdf = vp8_sad8x16;
+ cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
- cpi->fn_ptr[BLOCK_8X16].sdx3f = vp8_sad8x16x3;
- cpi->fn_ptr[BLOCK_8X16].sdx8f = vp8_sad8x16x8;
- cpi->fn_ptr[BLOCK_8X16].sdx4df = vp8_sad8x16x4d;
+ cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3;
+ cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8;
+ cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
- cpi->fn_ptr[BLOCK_8X8].sdf = vp8_sad8x8;
+ cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
- cpi->fn_ptr[BLOCK_8X8].sdx3f = vp8_sad8x8x3;
- cpi->fn_ptr[BLOCK_8X8].sdx8f = vp8_sad8x8x8;
- cpi->fn_ptr[BLOCK_8X8].sdx4df = vp8_sad8x8x4d;
+ cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3;
+ cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8;
+ cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
- cpi->fn_ptr[BLOCK_4X4].sdf = vp8_sad4x4;
+ cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
- cpi->fn_ptr[BLOCK_4X4].sdx3f = vp8_sad4x4x3;
- cpi->fn_ptr[BLOCK_4X4].sdx8f = vp8_sad4x4x8;
- cpi->fn_ptr[BLOCK_4X4].sdx4df = vp8_sad4x4x4d;
+ cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3;
+ cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8;
+ cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d;
#if ARCH_X86 || ARCH_X86_64
cpi->fn_ptr[BLOCK_16X16].copymem = vp8_copy32xn;
@@ -2206,9 +2271,6 @@ void vp8_remove_compressor(VP8_COMP **ptr)
if (cpi->b_calculate_psnr)
{
- YV12_BUFFER_CONFIG *lst_yv12 =
- &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
-
if (cpi->oxcf.number_of_layers > 1)
{
int i;
@@ -2220,7 +2282,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
double dr = (double)cpi->bytes_in_layer[i] *
8.0 / 1000.0 / time_encoded;
double samples = 3.0 / 2 * cpi->frames_in_layer[i] *
- lst_yv12->y_width * lst_yv12->y_height;
+ cpi->common.Width * cpi->common.Height;
double total_psnr =
vpx_sse_to_psnr(samples, 255.0,
cpi->total_error2[i]);
@@ -2242,7 +2304,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
else
{
double samples = 3.0 / 2 * cpi->count *
- lst_yv12->y_width * lst_yv12->y_height;
+ cpi->common.Width * cpi->common.Height;
double total_psnr = vpx_sse_to_psnr(samples, 255.0,
cpi->total_sq_error);
double total_psnr2 = vpx_sse_to_psnr(samples, 255.0,
@@ -2450,6 +2512,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
vpx_free(cpi->tok);
vpx_free(cpi->cyclic_refresh_map);
vpx_free(cpi->consec_zero_last);
+ vpx_free(cpi->consec_zero_last_mvbias);
vp8_remove_common(&cpi->common);
vpx_free(cpi);
@@ -2805,7 +2868,7 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi)
}
/* Update data structure that monitors level of reference to last GF */
- vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
/* this frame refreshes means next frames don't unless specified by user */
@@ -2854,7 +2917,7 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
}
/* Update data structure that monitors level of reference to last GF */
- vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
/* this frame refreshes means next frames don't unless specified by
@@ -3293,6 +3356,49 @@ static void update_reference_frames(VP8_COMP *cpi)
}
+static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ VP8_COMP *cpi)
+ {
+ int i, j;
+ int Total = 0;
+ int num_blocks = 0;
+ int skip = 2;
+ int min_consec_zero_last = 10;
+ int tot_num_blocks = (source->y_height * source->y_width) >> 8;
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+
+ /* Loop through the Y plane, every |skip| blocks along rows and columns,
+ * summing the square differences, and only for blocks that have used
+ * zero_last mode for at least |min_consec_zero_last| frames in a row.
+ */
+ for (i = 0; i < source->y_height; i += 16 * skip)
+ {
+ int block_index_row = (i >> 4) * cpi->common.mb_cols;
+ for (j = 0; j < source->y_width; j += 16 * skip)
+ {
+ int index = block_index_row + (j >> 4);
+ if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
+ unsigned int sse;
+ Total += vp8_mse16x16(src + j,
+ source->y_stride,
+ dst + j, dest->y_stride,
+ &sse);
+ num_blocks++;
+ }
+ }
+ src += 16 * skip * source->y_stride;
+ dst += 16 * skip * dest->y_stride;
+ }
+ // Only return non-zero if we have at least ~1/16 of the blocks for the estimate.
+ if (num_blocks > (tot_num_blocks >> 4)) {
+ return (Total / num_blocks);
+ } else {
+ return 0;
+ }
+ }
+
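For scale: at 640x480, tot_num_blocks = (480 * 640) >> 8 = 1200 16x16 blocks. With skip = 2 the loops visit every other block row and column, i.e. at most 15 * 20 = 300 blocks, and a non-zero estimate is returned only when more than 1200 >> 4 = 75 of the visited blocks pass the consec_zero_last test.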
#if CONFIG_TEMPORAL_DENOISING
static void process_denoiser_mode_change(VP8_COMP *cpi) {
const VP8_COMMON *const cm = &cpi->common;
@@ -3305,12 +3411,12 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
// Only select blocks for computing nmse that have been encoded
// as ZERO LAST min_consec_zero_last frames in a row.
// Scale with number of temporal layers.
- int min_consec_zero_last = 8 / cpi->oxcf.number_of_layers;
+ int min_consec_zero_last = 12 / cpi->oxcf.number_of_layers;
// Decision is tested for changing the denoising mode every
// num_mode_change times this function is called. Note that this
// function is called every 8 frames, so (8 * num_mode_change) is the
// number of frames over which a denoising mode change is tested.
- int num_mode_change = 15;
+ int num_mode_change = 20;
// Framerate factor, to compensate for larger mse at lower framerates.
// Use ref_framerate, which is full source framerate for temporal layers.
// TODO(marpan): Adjust this factor.
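Worked out: with num_mode_change = 20 and this function running once every 8 frames, a denoising mode switch is now considered every 8 * 20 = 160 frames (about 5.3 seconds at 30 fps), up from 8 * 15 = 120 frames.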
@@ -3322,7 +3428,12 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
static const unsigned char const_source[16] = {
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128};
-
+ int bandwidth = (int)(cpi->target_bandwidth);
+ // For temporal layers, use full bandwidth (top layer).
+ if (cpi->oxcf.number_of_layers > 1) {
+ LAYER_CONTEXT *lc = &cpi->layer_context[cpi->oxcf.number_of_layers - 1];
+ bandwidth = (int)(lc->target_bandwidth);
+ }
// Loop through the Y plane, every skip blocks along rows and columns,
// summing the normalized mean square error, only for blocks that have
// been encoded as ZEROMV LAST at least min_consec_zero_last frames in
@@ -3334,11 +3445,6 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
int index = block_index_row + (j >> 4);
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
unsigned int sse;
- const unsigned int mse = vp8_mse16x16(src + j,
- ystride,
- dst + j,
- ystride,
- &sse);
const unsigned int var = vp8_variance16x16(src + j,
ystride,
dst + j,
@@ -3347,14 +3453,15 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
// Only consider this block as valid for noise measurement
// if the sum_diff average of the current and previous frame
// is small (to avoid effects from lighting change).
- if ((mse - var) < 256) {
+ if ((sse - var) < 128) {
+ unsigned int sse2;
const unsigned int act = vp8_variance16x16(src + j,
ystride,
const_source,
0,
- &sse);
+ &sse2);
if (act > 0)
- total += mse / act;
+ total += sse / act;
num_blocks++;
}
}
@@ -3370,16 +3477,17 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
if (total > 0 &&
(num_blocks > (tot_num_blocks >> 4))) {
// Update the recursive mean square source_diff.
+ total = (total << 8) / num_blocks;
if (cpi->denoiser.nmse_source_diff_count == 0) {
// First sample in new interval.
cpi->denoiser.nmse_source_diff = total;
cpi->denoiser.qp_avg = cm->base_qindex;
} else {
// For subsequent samples, use average with weight ~1/4 for new sample.
- cpi->denoiser.nmse_source_diff = (int)((total >> 2) +
- 3 * (cpi->denoiser.nmse_source_diff >> 2));
- cpi->denoiser.qp_avg = (int)((cm->base_qindex >> 2) +
- 3 * (cpi->denoiser.qp_avg >> 2));
+ cpi->denoiser.nmse_source_diff = (int)((total +
+ 3 * cpi->denoiser.nmse_source_diff) >> 2);
+ cpi->denoiser.qp_avg = (int)((cm->base_qindex +
+ 3 * cpi->denoiser.qp_avg) >> 2);
}
cpi->denoiser.nmse_source_diff_count++;
}
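The update above is a first-order recursive (exponential) average with weight 1/4 on the new sample. A minimal sketch of the filter in isolation; note that summing before the shift, as the new code does, discards only one remainder instead of two:

    /* First-order recursive average: avg <- (new_sample + 3 * avg) / 4. */
    static int recursive_avg(int avg, int new_sample)
    {
        return (new_sample + 3 * avg) >> 2;
    }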
@@ -3391,7 +3499,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
(cpi->denoiser.nmse_source_diff >
cpi->denoiser.threshold_aggressive_mode) &&
(cpi->denoiser.qp_avg < cpi->denoiser.qp_threshold_up &&
- cpi->target_bandwidth > cpi->denoiser.bitrate_threshold)) {
+ bandwidth > cpi->denoiser.bitrate_threshold)) {
vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUVAggressive);
} else {
// Check for going down: from aggressive to normal mode.
@@ -3400,7 +3508,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
cpi->denoiser.threshold_aggressive_mode)) ||
((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
(cpi->denoiser.qp_avg > cpi->denoiser.qp_threshold_down ||
- cpi->target_bandwidth < cpi->denoiser.bitrate_threshold))) {
+ bandwidth < cpi->denoiser.bitrate_threshold))) {
vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);
}
}
@@ -3416,6 +3524,13 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
{
const FRAME_TYPE frame_type = cm->frame_type;
+ int update_any_ref_buffers = 1;
+ if (cpi->common.refresh_last_frame == 0 &&
+ cpi->common.refresh_golden_frame == 0 &&
+ cpi->common.refresh_alt_ref_frame == 0) {
+ update_any_ref_buffers = 0;
+ }
+
if (cm->no_lpf)
{
cm->filter_level = 0;
@@ -3427,11 +3542,36 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
vp8_clear_system_state();
vpx_usec_timer_start(&timer);
- if (cpi->sf.auto_filter == 0)
+ if (cpi->sf.auto_filter == 0) {
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME) {
+ // Use the denoised buffer for selecting base loop filter level.
+ // Denoised signal for current frame is stored in INTRA_FRAME.
+ // No denoising on key frames.
+ vp8cx_pick_filter_level_fast(
+ &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi);
+ } else {
+ vp8cx_pick_filter_level_fast(cpi->Source, cpi);
+ }
+#else
vp8cx_pick_filter_level_fast(cpi->Source, cpi);
-
- else
+#endif
+ } else {
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME) {
+ // Use the denoised buffer for selecting base loop filter level.
+ // Denoised signal for current frame is stored in INTRA_FRAME.
+ // No denoising on key frames.
+ vp8cx_pick_filter_level(
+ &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi);
+ } else {
+ vp8cx_pick_filter_level(cpi->Source, cpi);
+ }
+#else
vp8cx_pick_filter_level(cpi->Source, cpi);
+#endif
+ }
+
if (cm->filter_level > 0)
{
@@ -3447,7 +3587,9 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
#endif
- if (cm->filter_level > 0)
+ // No need to apply loop-filter if the encoded frame does not update
+ // any reference buffers.
+ if (cm->filter_level > 0 && update_any_ref_buffers)
{
vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, frame_type);
}
@@ -3577,39 +3719,78 @@ static void encode_frame_to_data_rate
}
#if CONFIG_MULTI_RES_ENCODING
- /* In multi-resolution encoding, frame_type is decided by lowest-resolution
- * encoder. Same frame_type is adopted while encoding at other resolution.
- */
- if (cpi->oxcf.mr_encoder_id)
- {
- LOWER_RES_FRAME_INFO* low_res_frame_info
- = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+ if (cpi->oxcf.mr_total_resolutions > 1) {
+ LOWER_RES_FRAME_INFO* low_res_frame_info
+ = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+ if (cpi->oxcf.mr_encoder_id) {
+
+ // TODO(marpan): This constraint shouldn't be needed, as we would like
+ // to allow for key frame setting (forced or periodic) defined per
+ // spatial layer. For now, keep this in.
cm->frame_type = low_res_frame_info->frame_type;
+ // Check if lower resolution is available for motion vector reuse.
if(cm->frame_type != KEY_FRAME)
{
- cpi->mr_low_res_mv_avail = 1;
- cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped);
-
- if (cpi->ref_frame_flags & VP8_LAST_FRAME)
- cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[LAST_FRAME]
- == low_res_frame_info->low_res_ref_frames[LAST_FRAME]);
-
- if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
- cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[GOLDEN_FRAME]
- == low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]);
+ cpi->mr_low_res_mv_avail = 1;
+ cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped);
+
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[LAST_FRAME]
+ == low_res_frame_info->low_res_ref_frames[LAST_FRAME]);
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[GOLDEN_FRAME]
+ == low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]);
+
+ // Don't use altref to determine whether low res is available.
+ // TODO(marpan): Should we apply this type of condition on a
+ // per-reference-frame basis?
+ /*
+ if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME]
+ == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
+ */
+ }
+ }
- if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
- cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME]
- == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
+ // On a key frame: For the lowest resolution, keep track of the key frame
+ // counter value. For the higher resolutions, reset the current video
+ // frame counter to that of the lowest resolution.
+ // This is done to handle the case where we may stop/start encoding
+ // higher layer(s). The restart of a higher layer is only signaled
+ // by a key frame for now.
+ // TODO (marpan): Add flag to indicate restart-encoding of higher layer.
+ if (cm->frame_type == KEY_FRAME) {
+ if (cpi->oxcf.mr_encoder_id) {
+ // If the initial starting value of the buffer level is zero (this can
+ // happen because we may not have started encoding this higher stream),
+ // then reset it to a non-zero value based on |starting_buffer_level|.
+ if (cpi->common.current_video_frame == 0 && cpi->buffer_level == 0) {
+ unsigned int i;
+ cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+ cpi->buffer_level = cpi->oxcf.starting_buffer_level;
+ for (i = 0; i < cpi->oxcf.number_of_layers; i++) {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ lc->bits_off_target = lc->starting_buffer_level;
+ lc->buffer_level = lc->starting_buffer_level;
+ }
+ }
+ cpi->common.current_video_frame =
+ low_res_frame_info->key_frame_counter_value;
+ } else {
+ low_res_frame_info->key_frame_counter_value =
+ cpi->common.current_video_frame;
}
+ }
+
}
#endif
// Find the reference frame closest to the current frame.
cpi->closest_reference_frame = LAST_FRAME;
- if (cm->frame_type != KEY_FRAME) {
+ if(cm->frame_type != KEY_FRAME) {
int i;
MV_REFERENCE_FRAME closest_ref = INTRA_FRAME;
if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
@@ -3619,12 +3800,12 @@ static void encode_frame_to_data_rate
} else if (cpi->ref_frame_flags & VP8_ALTR_FRAME) {
closest_ref = ALTREF_FRAME;
}
- for (i = 1; i <= 3; i++) {
+ for(i = 1; i <= 3; i++) {
vpx_ref_frame_type_t ref_frame_type = (vpx_ref_frame_type_t)
((i == 3) ? 4 : i);
if (cpi->ref_frame_flags & ref_frame_type) {
if ((cm->current_video_frame - cpi->current_ref_frames[i]) <
- (cm->current_video_frame - cpi->current_ref_frames[closest_ref])) {
+ (cm->current_video_frame - cpi->current_ref_frames[closest_ref])) {
closest_ref = i;
}
}
@@ -3650,7 +3831,9 @@ static void encode_frame_to_data_rate
}
// Reset the zero_last counter to 0 on key frame.
- vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+ memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+ memset(cpi->consec_zero_last_mvbias, 0,
+ (cpi->common.mb_rows * cpi->common.mb_cols));
}
#if 0
@@ -4179,8 +4362,10 @@ static void encode_frame_to_data_rate
else
disable_segmentation(cpi);
}
- // Reset the consec_zero_last counter on key frame.
- vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+ // Reset the zero_last counter to 0 on key frame.
+ memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+ memset(cpi->consec_zero_last_mvbias, 0,
+ (cpi->common.mb_rows * cpi->common.mb_cols));
vp8_set_quantizer(cpi, Q);
}
@@ -4203,7 +4388,7 @@ static void encode_frame_to_data_rate
if (cm->refresh_entropy_probs == 0)
{
/* save a copy for later refresh */
- vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+ memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
}
vp8_update_coef_context(cpi);
@@ -4613,6 +4798,22 @@ static void encode_frame_to_data_rate
cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
#if CONFIG_TEMPORAL_DENOISING
+ // Get some measure of the amount of noise, by measuring the (partial) mse
+ // between the source and the denoised buffer, for the y channel. Partial
+ // refers to computing the sse on a sub-sample of the frame (i.e., skip
+ // x blocks along row/column), and only for blocks in that set that have
+ // used ZEROMV_LAST mode consecutively.
+ // Do this every ~8 frames, to further reduce complexity.
+ // TODO(marpan): Keep this for now for the case cpi->oxcf.noise_sensitivity < 4;
+ // it should be removed in favor of the process_denoiser_mode_change() function below.
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ cpi->oxcf.noise_sensitivity < 4 &&
+ !cpi->oxcf.screen_content_mode &&
+ cpi->frames_since_key%8 == 0 &&
+ cm->frame_type != KEY_FRAME) {
+ cpi->mse_source_denoised = measure_square_diff_partial(
+ &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi->Source, cpi);
+ }
+
// For the adaptive denoising mode (noise_sensitivity == 4), sample the mse
// of source diff (between current and previous frame), and determine if we
// should switch the denoiser mode. Sampling refers to computing the mse for
@@ -4621,6 +4822,7 @@ static void encode_frame_to_data_rate
// constraint on the sum diff between blocks. This process is called every
// ~8 frames, to further reduce complexity.
if (cpi->oxcf.noise_sensitivity == 4 &&
+ !cpi->oxcf.screen_content_mode &&
cpi->frames_since_key % 8 == 0 &&
cm->frame_type != KEY_FRAME) {
process_denoiser_mode_change(cpi);
@@ -4758,6 +4960,13 @@ static void encode_frame_to_data_rate
if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+ // If the frame dropper is not enabled, don't let the buffer level go below
+ // some threshold, given here by -|maximum_buffer_size|. For now we only do
+ // this for screen content input.
+ if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode &&
+ cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size)
+ cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size;
+
/* Rolling monitors of whether we are over or underspending used to
* help regulate min and Max Q in two pass.
*/
@@ -5232,7 +5441,26 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
cpi->ref_framerate = 10000000.0 / avg_duration;
}
-
+#if CONFIG_MULTI_RES_ENCODING
+ if (cpi->oxcf.mr_total_resolutions > 1) {
+ LOWER_RES_FRAME_INFO* low_res_frame_info = (LOWER_RES_FRAME_INFO*)
+ cpi->oxcf.mr_low_res_mode_info;
+ // Frame rate should be the same for all spatial layers in
+ // multi-res-encoding (simulcast), so we constrain the frame rate for
+ // higher layers to be that of the lowest resolution. This is needed
+ // as the application may decide to skip encoding a high layer and
+ // then start again, in which case a big jump in time-stamps will
+ // be received for that high layer, which will yield an incorrect
+ // frame rate (from the time-stamp adjustment in the above calculation).
+ if (cpi->oxcf.mr_encoder_id) {
+ cpi->ref_framerate = low_res_frame_info->low_res_framerate;
+ }
+ else {
+ // Keep track of frame rate for lowest resolution.
+ low_res_frame_info->low_res_framerate = cpi->ref_framerate;
+ }
+ }
+#endif
if (cpi->oxcf.number_of_layers > 1)
{
unsigned int i;
@@ -5262,8 +5490,12 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
update_layer_contexts (cpi);
/* Restore layer specific context & set frame rate */
- layer = cpi->oxcf.layer_id[
- cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
+ if (cpi->temporal_layer_id >= 0) {
+ layer = cpi->temporal_layer_id;
+ } else {
+ layer = cpi->oxcf.layer_id[
+ cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
+ }
restore_layer_context (cpi, layer);
vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
}
@@ -5382,19 +5614,19 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
if (cm->refresh_entropy_probs == 0)
{
- vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
+ memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
}
/* Save the contexts separately for alt ref, gold and last. */
/* (TODO jbb -> Optimize this with pointers to avoid extra copies. ) */
if(cm->refresh_alt_ref_frame)
- vpx_memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
+ memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
if(cm->refresh_golden_frame)
- vpx_memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
+ memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
if(cm->refresh_last_frame)
- vpx_memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
+ memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
/* if its a dropped frame honor the requests on subsequent frames */
if (*size > 0)
@@ -5439,19 +5671,23 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
double frame_psnr;
YV12_BUFFER_CONFIG *orig = cpi->Source;
YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
- int y_samples = orig->y_height * orig->y_width ;
- int uv_samples = orig->uv_height * orig->uv_width ;
+ unsigned int y_width = cpi->common.Width;
+ unsigned int y_height = cpi->common.Height;
+ unsigned int uv_width = (y_width + 1) / 2;
+ unsigned int uv_height = (y_height + 1) / 2;
+ int y_samples = y_height * y_width;
+ int uv_samples = uv_height * uv_width;
int t_samples = y_samples + 2 * uv_samples;
double sq_error;
ye = calc_plane_error(orig->y_buffer, orig->y_stride,
- recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height);
+ recon->y_buffer, recon->y_stride, y_width, y_height);
ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
- recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
+ recon->u_buffer, recon->uv_stride, uv_width, uv_height);
ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
- recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
+ recon->v_buffer, recon->uv_stride, uv_width, uv_height);
sq_error = (double)(ye + ue + ve);
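vpx_sse_to_psnr() converts this total squared error into PSNR in dB. A minimal equivalent sketch; the 100 dB cap for (near-)lossless frames is an assumption taken from libvpx's MAX_PSNR convention:

    #include <math.h>
    /* PSNR in dB over |samples| values with peak value |peak|
     * (255.0 for 8-bit video), capped at an assumed 100 dB. */
    static double sse_to_psnr(double samples, double peak, double sse)
    {
        if (sse > 0.0) {
            const double psnr = 10.0 * log10(samples * peak * peak / sse);
            return psnr > 100.0 ? 100.0 : psnr;
        }
        return 100.0;
    }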
@@ -5473,13 +5709,13 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
vp8_clear_system_state();
ye = calc_plane_error(orig->y_buffer, orig->y_stride,
- pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height);
+ pp->y_buffer, pp->y_stride, y_width, y_height);
ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
- pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
+ pp->u_buffer, pp->uv_stride, uv_width, uv_height);
ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
- pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
+ pp->v_buffer, pp->uv_stride, uv_width, uv_height);
sq_error2 = (double)(ye + ue + ve);
@@ -5606,6 +5842,7 @@ int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppfla
cpi->common.show_frame_mi = cpi->common.mi;
ret = vp8_post_proc_frame(&cpi->common, dest, flags);
#else
+ (void)flags;
if (cpi->common.frame_to_show)
{
@@ -5698,7 +5935,7 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, uns
{
if (map)
{
- vpx_memcpy(cpi->active_map, map, rows * cols);
+ memcpy(cpi->active_map, map, rows * cols);
cpi->active_map_enabled = 1;
}
else
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
index f0424e69ca5..82d7453902c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
@@ -513,10 +513,18 @@ typedef struct VP8_COMP
signed char *cyclic_refresh_map;
// Count of how many (consecutive) times a macroblock uses ZEROMV_LAST.
unsigned char *consec_zero_last;
+ // Counter that is reset when a block is checked for a mode-bias against
+ // ZEROMV_LASTREF.
+ unsigned char *consec_zero_last_mvbias;
// Frame counter for the temporal pattern. Counter is reset when the temporal
// layers are changed dynamically (run-time change).
unsigned int temporal_pattern_counter;
+ // Temporal layer id.
+ int temporal_layer_id;
+
+ // Measure of average squared difference between source and denoised signal.
+ int mse_source_denoised;
#if CONFIG_MULTITHREAD
/* multithread data */
@@ -657,6 +665,9 @@ typedef struct VP8_COMP
int droppable;
+ int initial_width;
+ int initial_height;
+
#if CONFIG_TEMPORAL_DENOISING
VP8_DENOISER denoiser;
#endif
@@ -687,6 +698,7 @@ typedef struct VP8_COMP
#endif
/* The frame number of each reference frames */
unsigned int current_ref_frames[MAX_REF_FRAMES];
+ // Closest reference frame to current frame.
MV_REFERENCE_FRAME closest_reference_frame;
struct rd_costs_struct
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c
index 9d5556dcdee..c4c0e7e9e23 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c
@@ -40,6 +40,134 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
+// Fixed-point implementation of a skin color classifier. Skin color
+// is modeled by a Gaussian distribution in the CbCr color space.
+// See ../../test/skin_color_detector_test.cc where the reference
+// skin color classifier is defined.
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[2] = {7463, 9614}; // q6
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157}; // q16
+static const int skin_threshold = 1570636; // q18
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
+static int evaluate_skin_color_difference(int cb, int cr)
+{
+ const int cb_q6 = cb << 6;
+ const int cr_q6 = cr << 6;
+ const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
+ const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
+ const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+ const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+ const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+ const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
+ const int skin_diff = skin_inv_cov[0] * cb_diff_q2 +
+ skin_inv_cov[1] * cbcr_diff_q2 +
+ skin_inv_cov[2] * cbcr_diff_q2 +
+ skin_inv_cov[3] * cr_diff_q2;
+ return skin_diff;
+}
+
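For intuition, the same model in floating point; this is a sketch derived from the q6/q16/q18 constants above, not a reference implementation. The CbCr mean works out to (7463/64, 9614/64) ~= (116.6, 150.2) and the threshold to 1570636/2^18 ~= 5.99:

    /* Floating-point sketch of the fixed-point classifier above. */
    static int skin_diff_exceeded(int cb, int cr)
    {
        const double mean_cb = 7463.0 / 64.0;              /* q6 */
        const double mean_cr = 9614.0 / 64.0;              /* q6 */
        const double icov[4] = { 4107.0 / 65536.0, 1663.0 / 65536.0,
                                 1663.0 / 65536.0, 2157.0 / 65536.0 };  /* q16 */
        const double threshold = 1570636.0 / 262144.0;     /* q18 */
        const double dcb = cb - mean_cb;
        const double dcr = cr - mean_cr;
        /* Mahalanobis distance in CbCr. */
        const double d = icov[0] * dcb * dcb + (icov[1] + icov[2]) * dcb * dcr +
                         icov[3] * dcr * dcr;
        return d >= threshold;  /* large distance => not skin */
    }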
+static int macroblock_corner_grad(unsigned char* signal, int stride,
+ int offsetx, int offsety, int sgnx, int sgny)
+{
+ int y1 = signal[offsetx * stride + offsety];
+ int y2 = signal[offsetx * stride + offsety + sgny];
+ int y3 = signal[(offsetx + sgnx) * stride + offsety];
+ int y4 = signal[(offsetx + sgnx) * stride + offsety + sgny];
+ return MAX(MAX(abs(y1 - y2), abs(y1 - y3)), abs(y1 - y4));
+}
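For example, with y1 = 130 and the three neighbors y2 = 128, y3 = 100, y4 = 129, the measure is MAX(MAX(2, 30), 1) = 30: a strong one-pixel edge at the corner, well above the threshold1 = 6 used below.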
+
+static int check_dot_artifact_candidate(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ unsigned char *target_last,
+ int stride,
+ unsigned char* last_ref,
+ int mb_row,
+ int mb_col,
+ int channel)
+{
+ int threshold1 = 6;
+ int threshold2 = 3;
+ unsigned int max_num = (cpi->common.MBs) / 10;
+ int grad_last = 0;
+ int grad_source = 0;
+ int index = mb_row * cpi->common.mb_cols + mb_col;
+ // Threshold for #consecutive (base layer) frames using zero_last mode.
+ int num_frames = 30;
+ int shift = 15;
+ if (channel > 0) {
+ shift = 7;
+ }
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ num_frames = 20;
+ }
+ x->zero_last_dot_suppress = 0;
+ // Blocks on base layer frames that have been using ZEROMV_LAST repeatedly
+ // (i.e., for at least |x| consecutive frames) are candidates for increasing
+ // the rd adjustment for zero_last mode.
+ // Only allow this for at most |max_num| blocks per frame.
+ // Don't allow this for screen content input.
+ if (cpi->current_layer == 0 &&
+ cpi->consec_zero_last_mvbias[index] > num_frames &&
+ x->mbs_zero_last_dot_suppress < max_num &&
+ !cpi->oxcf.screen_content_mode)
+ {
+ // If this block is checked here, label it so we don't check it again until
+ // ~|x| frames later.
+ x->zero_last_dot_suppress = 1;
+ // A dot artifact is noticeable as a strong gradient at the corners of a
+ // macroblock, in flat areas. As a simple detector for now, we look for a
+ // high corner gradient on the last ref, and a smaller gradient on the
+ // source. Check 4 corners; return if any corner satisfies the condition.
+ // Top-left:
+ grad_last = macroblock_corner_grad(last_ref, stride, 0, 0, 1, 1);
+ grad_source = macroblock_corner_grad(target_last, stride, 0, 0, 1, 1);
+ if (grad_last >= threshold1 && grad_source <= threshold2)
+ {
+ x->mbs_zero_last_dot_suppress++;
+ return 1;
+ }
+ // Top-right:
+ grad_last = macroblock_corner_grad(last_ref, stride, 0, shift, 1, -1);
+ grad_source = macroblock_corner_grad(target_last, stride, 0, shift, 1, -1);
+ if (grad_last >= threshold1 && grad_source <= threshold2)
+ {
+ x->mbs_zero_last_dot_suppress++;
+ return 1;
+ }
+ // Bottom-left:
+ grad_last = macroblock_corner_grad(last_ref, stride, shift, 0, -1, 1);
+ grad_source = macroblock_corner_grad(target_last, stride, shift, 0, -1, 1);
+ if (grad_last >= threshold1 && grad_source <= threshold2)
+ {
+ x->mbs_zero_last_dot_suppress++;
+ return 1;
+ }
+ // Bottom-right:
+ grad_last = macroblock_corner_grad(last_ref, stride, shift, shift, -1, -1);
+ grad_source = macroblock_corner_grad(target_last, stride, shift, shift, -1, -1);
+ if (grad_last >= threshold1 && grad_source <= threshold2)
+ {
+ x->mbs_zero_last_dot_suppress++;
+ return 1;
+ }
+ return 0;
+ }
+ return 0;
+}
+
+// Checks if the input YCbCr values correspond to skin color.
+static int is_skin_color(int y, int cb, int cr)
+{
+ if (y < 40 || y > 220)
+ {
+ return 0;
+ }
+ return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
+}
+
int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
@@ -52,6 +180,7 @@ int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
(void) ref_mv;
(void) error_per_bit;
(void) vfp;
+ (void) mb;
(void) mvcost;
(void) distortion;
(void) sse;
@@ -514,10 +643,17 @@ static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2,
#endif
// Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame.
- if (this_mode == ZEROMV &&
- x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
- (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME)) {
- this_rd = ((int64_t)this_rd) * rd_adj / 100;
+ // TODO: We should also add a condition on the distance of the closest reference to the current frame.
+ if(!cpi->oxcf.screen_content_mode &&
+ this_mode == ZEROMV &&
+ x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
+ (denoise_aggressive || (cpi->closest_reference_frame == LAST_FRAME)))
+ {
+ // No adjustment if the block is considered to be a skin area.
+ if(x->is_skin)
+ rd_adj = 100;
+
+ this_rd = ((int64_t)this_rd) * rd_adj / 100;
}
check_for_encode_breakout(*sse, x);
@@ -597,6 +733,15 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
#endif
int sf_improved_mv_pred = cpi->sf.improved_mv_pred;
+
+#if CONFIG_MULTI_RES_ENCODING
+ int dissim = INT_MAX;
+ int parent_ref_frame = 0;
+ int_mv parent_ref_mv;
+ MB_PREDICTION_MODE parent_mode = 0;
+ int parent_ref_valid = 0;
+#endif
+
int_mv mvp;
int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
@@ -607,14 +752,56 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
unsigned char *plane[4][3];
int ref_frame_map[4];
int sign_bias = 0;
+ int dot_artifact_candidate = 0;
+ get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
-#if CONFIG_MULTI_RES_ENCODING
- int dissim = INT_MAX;
- int parent_ref_frame = 0;
- int parent_ref_valid = cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail;
- int_mv parent_ref_mv;
- MB_PREDICTION_MODE parent_mode = 0;
+ // If the current frame is using LAST as a reference, check for
+ // biasing the mode selection for dot artifacts.
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
+ unsigned char* target_y = x->src.y_buffer;
+ unsigned char* target_u = x->block[16].src + *x->block[16].base_src;
+ unsigned char* target_v = x->block[20].src + *x->block[20].base_src;
+ int stride = x->src.y_stride;
+ int stride_uv = x->block[16].src_stride;
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity) {
+ const int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
+ target_y =
+ cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset;
+ stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride;
+ if (uv_denoise) {
+ target_u =
+ cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer +
+ recon_uvoffset;
+ target_v =
+ cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer +
+ recon_uvoffset;
+ stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride;
+ }
+ }
+#endif
+ dot_artifact_candidate =
+ check_dot_artifact_candidate(cpi, x, target_y, stride,
+ plane[LAST_FRAME][0], mb_row, mb_col, 0);
+ // If not found in Y channel, check UV channel.
+ if (!dot_artifact_candidate) {
+ dot_artifact_candidate =
+ check_dot_artifact_candidate(cpi, x, target_u, stride_uv,
+ plane[LAST_FRAME][1], mb_row, mb_col, 1);
+ if (!dot_artifact_candidate) {
+ dot_artifact_candidate =
+ check_dot_artifact_candidate(cpi, x, target_v, stride_uv,
+ plane[LAST_FRAME][2], mb_row, mb_col, 2);
+ }
+ }
+ }
+#if CONFIG_MULTI_RES_ENCODING
+ // |parent_ref_valid| will be set here if we can potentially do mv reuse for
+ // this higher resolution (|cpi->oxcf.mr_encoder_id| > 0) frame.
+ // |parent_ref_valid| may be reset depending on |parent_ref_frame| for
+ // the current macroblock below.
+ parent_ref_valid = cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail;
if (parent_ref_valid)
{
int parent_ref_flag;
@@ -632,24 +819,51 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
* In this event, take the conservative approach of disabling the
* lower res info for this MB.
*/
+
parent_ref_flag = 0;
+ // Note availability for mv reuse is only based on last and golden.
if (parent_ref_frame == LAST_FRAME)
parent_ref_flag = (cpi->ref_frame_flags & VP8_LAST_FRAME);
else if (parent_ref_frame == GOLDEN_FRAME)
parent_ref_flag = (cpi->ref_frame_flags & VP8_GOLD_FRAME);
- else if (parent_ref_frame == ALTREF_FRAME)
- parent_ref_flag = (cpi->ref_frame_flags & VP8_ALTR_FRAME);
//assert(!parent_ref_frame || parent_ref_flag);
+
+ // If |parent_ref_frame| did not match either last or golden then
+ // shut off mv reuse.
if (parent_ref_frame && !parent_ref_flag)
parent_ref_valid = 0;
+
+ // Don't do mv reuse since we want to allow for another mode besides
+ // ZEROMV_LAST to remove dot artifact.
+ if (dot_artifact_candidate)
+ parent_ref_valid = 0;
+ }
+#endif
+
+ // Check if current macroblock is in skin area.
+ {
+ const int y = x->src.y_buffer[7 * x->src.y_stride + 7];
+ const int cb = x->src.u_buffer[3 * x->src.uv_stride + 3];
+ const int cr = x->src.v_buffer[3 * x->src.uv_stride + 3];
+ x->is_skin = 0;
+ if (!cpi->oxcf.screen_content_mode)
+ x->is_skin = is_skin_color(y, cb, cr);
+ }
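The sampled position is the center of the macroblock: (7, 7) in the 16x16 luma block and (3, 3) in the co-located 8x8 chroma blocks; a single pixel per plane keeps the per-macroblock check cheap.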
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity) {
+ // Under aggressive denoising mode, should we use the skin map to reduce
+ // the denoiser and ZEROMV bias? Will need to revisit the accuracy of this
+ // detection for very noisy input. For now keep this as is (i.e., don't
+ // turn it off).
+ // if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive)
+ // x->is_skin = 0;
}
#endif
mode_mv = mode_mv_sb[sign_bias];
best_ref_mv.as_int = 0;
- vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
- vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+ memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+ memset(&best_mbmode, 0, sizeof(best_mbmode));
/* Setup search priorities */
#if CONFIG_MULTI_RES_ENCODING
@@ -680,8 +894,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
}
- get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
-
/* Count of the number of MBs tested so far this frame */
x->mbs_tested_so_far++;
@@ -691,9 +903,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
/* If the frame has big static background and current MB is in low
- * motion area, its mode decision is biased to ZEROMV mode.
- */
- calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+ * motion area, its mode decision is biased to ZEROMV mode.
+ * No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12).
+ * At such speed settings, ZEROMV is already heavily favored.
+ */
+ if (cpi->Speed < 12) {
+ calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+ }
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity) {
@@ -702,6 +918,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
}
#endif
+ if (dot_artifact_candidate)
+ {
+ // Bias against ZEROMV_LAST mode.
+ rd_adjustment = 150;
+ }
+
+
/* if we encode a new mv this is important
* find the best new motion vector
*/
@@ -887,14 +1110,17 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
step_param = cpi->sf.first_step + speed_adjust;
#if CONFIG_MULTI_RES_ENCODING
- /* If lower-res drops this frame, then higher-res encoder does
- motion search without any previous knowledge. Also, since
- last frame motion info is not stored, then we can not
+ /* If the lower-res frame is not available for mv reuse (because of
+ frame dropping or a different temporal layer pattern), then the higher
+ resolution encoder does motion search without any previous knowledge.
+ Also, since last frame motion info is not stored, we cannot
use improved_mv_pred. */
- if (cpi->oxcf.mr_encoder_id && !parent_ref_valid)
+ if (cpi->oxcf.mr_encoder_id)
sf_improved_mv_pred = 0;
- if (parent_ref_valid && parent_ref_frame)
+ // Only use parent MV as predictor if this candidate reference frame
+ // (|this_ref_frame|) is equal to |parent_ref_frame|.
+ if (parent_ref_valid && (parent_ref_frame == this_ref_frame))
{
/* Use parent MV as predictor. Adjust search range
* accordingly.
@@ -938,7 +1164,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
}
#if CONFIG_MULTI_RES_ENCODING
- if (parent_ref_valid && parent_ref_frame && dissim <= 2 &&
+ if (parent_ref_valid && (parent_ref_frame == this_ref_frame) &&
+ dissim <= 2 &&
MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4)
{
@@ -975,10 +1202,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
* change the behavior in lowest-resolution encoder.
* Will improve it later.
*/
- /* Set step_param to 0 to ensure large-range motion search
- when encoder drops this frame at lower-resolution.
- */
- if (!parent_ref_valid)
+ /* Set step_param to 0 to ensure large-range motion search
+ * when mv reuse is not valid (i.e. |parent_ref_valid| = 0),
+ * or if this candidate reference frame (|this_ref_frame|) is
+ * not equal to |parent_ref_frame|.
+ */
+ if (!parent_ref_valid || (parent_ref_frame != this_ref_frame))
step_param = 0;
#endif
bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv,
@@ -1080,7 +1309,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity)
{
-
/* Store for later use by denoiser. */
// Don't denoise with GOLDEN or ALTREF if they are old reference
// frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past).
@@ -1096,7 +1324,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->e_mbd.mode_info_context->mbmi.ref_frame;
}
- /* Store the best NEWMV in x for later use in the denoiser. */
+ // Store the best NEWMV in x for later use in the denoiser.
if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
sse < best_sse && !skip_old_reference)
{
@@ -1120,8 +1348,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
*returndistortion = distortion2;
best_rd_sse = sse;
best_rd = this_rd;
- vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
- sizeof(MB_MODE_INFO));
+ memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+ sizeof(MB_MODE_INFO));
/* Testing this mode gave rise to an improvement in best error
* score. Lower threshold a bit for next time
@@ -1184,6 +1412,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
if (cpi->oxcf.noise_sensitivity)
{
int block_index = mb_row * cpi->common.mb_cols + mb_col;
+ int reevaluate = 0;
+ int is_noisy = 0;
if (x->best_sse_inter_mode == DC_PRED)
{
/* No best MV found. */
@@ -1193,18 +1423,52 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->best_reference_frame = best_mbmode.ref_frame;
best_sse = best_rd_sse;
}
+ // For non-skin blocks that have selected ZEROMV for the current frame,
+ // and have been selecting ZEROMV_LAST (on the base layer frame) for at
+ // least |x~20| consecutive past frames in a row, label the block for a
+ // possible increase in denoising strength. We also condition this
+ // labeling on there being significant denoising in the scene.
+ if (cpi->oxcf.noise_sensitivity == 4) {
+ if (cpi->denoiser.nmse_source_diff >
+ 70 * cpi->denoiser.threshold_aggressive_mode / 100)
+ is_noisy = 1;
+ } else {
+ if (cpi->mse_source_denoised > 1000)
+ is_noisy = 1;
+ }
x->increase_denoising = 0;
+ if (!x->is_skin &&
+ x->best_sse_inter_mode == ZEROMV &&
+ (x->best_reference_frame == LAST_FRAME ||
+ x->best_reference_frame == cpi->closest_reference_frame) &&
+ cpi->consec_zero_last[block_index] >= 20 &&
+ is_noisy) {
+ x->increase_denoising = 1;
+ }
+ x->denoise_zeromv = 0;
vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
recon_yoffset, recon_uvoffset,
&cpi->common.lf_info, mb_row, mb_col,
block_index);
- /* Reevaluate ZEROMV after denoising. */
- if (best_mbmode.ref_frame == INTRA_FRAME &&
+ // Reevaluate ZEROMV after denoising: for large noise content
+ // (i.e., cpi->mse_source_denoised is above threshold), do this for all
+ // blocks that did not pick ZEROMV as best mode but are using ZEROMV
+ // for denoising. Otherwise, always re-evaluate for blocks that picked
+ // INTRA mode as best mode.
+ // Avoid blocks that have been biased against ZERO_LAST
+ // (i.e., dot artifact candidate blocks).
+ reevaluate = (best_mbmode.ref_frame == INTRA_FRAME) ||
+ (best_mbmode.mode != ZEROMV &&
+ x->denoise_zeromv &&
+ cpi->mse_source_denoised > 2000);
+ if (!dot_artifact_candidate &&
+ reevaluate &&
x->best_zeromv_reference_frame != INTRA_FRAME)
{
int this_rd = 0;
int this_ref_frame = x->best_zeromv_reference_frame;
+ rd_adjustment = 100;
rate2 = x->ref_frame_cost[this_ref_frame] +
vp8_cost_mv_ref(ZEROMV, mdcounts);
distortion2 = 0;
@@ -1223,8 +1487,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
if (this_rd < best_rd)
{
- vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
- sizeof(MB_MODE_INFO));
+ memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+ sizeof(MB_MODE_INFO));
}
}
@@ -1248,8 +1512,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
/* set to the best mb mode, this copy can be skipped if x->skip since it
* already has the right content */
if (!x->skip)
- vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
- sizeof(MB_MODE_INFO));
+ memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
+ sizeof(MB_MODE_INFO));
if (best_mbmode.mode <= B_PRED)
{
@@ -1264,7 +1528,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
update_mvcount(x, &best_ref_mv);
}
-
void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
{
int error4x4, error16x16 = INT_MAX;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c
index f0c8f28fc96..890053dcfdc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c
@@ -49,7 +49,7 @@ static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
src_y = src_ybc->y_buffer + yoffset;
dst_y = dst_ybc->y_buffer + yoffset;
- vpx_memcpy(dst_y, src_y, ystride * linestocopy);
+ memcpy(dst_y, src_y, ystride * linestocopy);
}
static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
@@ -142,7 +142,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
int filt_val;
- int best_filt_val = cm->filter_level;
+ int best_filt_val;
YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
/* Replace unfiltered frame buffer with a new one */
@@ -274,8 +274,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
int filter_step;
int filt_high = 0;
- /* Start search at previous frame filter level */
- int filt_mid = cm->filter_level;
+ int filt_mid;
int filt_low = 0;
int filt_best;
int filt_direction = 0;
@@ -287,7 +286,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
- vpx_memset(ss_err, 0, sizeof(ss_err));
+ memset(ss_err, 0, sizeof(ss_err));
/* Replace unfiltered frame buffer with a new one */
cm->frame_to_show = &cpi->pick_lf_lvl_frame;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/csystemdependent.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/csystemdependent.c
deleted file mode 100644
index 63f23578467..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/csystemdependent.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-SADFunction *vp8_sad16x16;
-SADFunction *vp8_sad16x8;
-SADFunction *vp8_sad8x16;
-SADFunction *vp8_sad8x8;
-SADFunction *vp8_sad4x4;
-
-variance_function *vp8_variance4x4;
-variance_function *vp8_variance8x8;
-variance_function *vp8_variance8x16;
-variance_function *vp8_variance16x8;
-variance_function *vp8_variance16x16;
-
-variance_function *vp8_mse16x16;
-
-sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
-
-int (*vp8_block_error)(short *coeff, short *dqcoeff);
-int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
-
-int (*vp8_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp8_get_mb_ss)(short *);
-void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-void (*short_walsh4x4)(short *input, short *output, int pitch);
-
-void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-
-unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// c imports
-extern int block_error_c(short *coeff, short *dqcoeff);
-extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc);
-
-extern int vp8_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern void short_fdct4x4_c(short *input, short *output, int pitch);
-extern void short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-
-extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction sad16x16_c;
-extern SADFunction sad16x8_c;
-extern SADFunction sad8x16_c;
-extern SADFunction sad8x8_c;
-extern SADFunction sad4x4_c;
-
-extern variance_function variance16x16_c;
-extern variance_function variance8x16_c;
-extern variance_function variance16x8_c;
-extern variance_function variance8x8_c;
-extern variance_function variance4x4_c;
-extern variance_function mse16x16_c;
-
-extern sub_pixel_variance_function sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function sub_pixel_variance16x16_c;
-
-extern unsigned int vp8_get_mb_ss_c(short *);
-extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// ppc
-extern int vp8_block_error_ppc(short *coeff, short *dqcoeff);
-
-extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch);
-
-extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-
-extern SADFunction vp8_sad16x16_ppc;
-extern SADFunction vp8_sad16x8_ppc;
-extern SADFunction vp8_sad8x16_ppc;
-extern SADFunction vp8_sad8x8_ppc;
-extern SADFunction vp8_sad4x4_ppc;
-
-extern variance_function vp8_variance16x16_ppc;
-extern variance_function vp8_variance8x16_ppc;
-extern variance_function vp8_variance16x8_ppc;
-extern variance_function vp8_variance8x8_ppc;
-extern variance_function vp8_variance4x4_ppc;
-extern variance_function vp8_mse16x16_ppc;
-
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc;
-
-extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-
-void vp8_cmachine_specific_config(void)
-{
- // Pure C:
- vp8_mbuverror = vp8_mbuverror_c;
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
- vp8_short_fdct4x4 = vp8_short_fdct4x4_ppc;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_ppc;
- vp8_fast_fdct4x4 = vp8_short_fdct4x4_ppc;
- vp8_fast_fdct8x4 = vp8_short_fdct8x4_ppc;
- short_walsh4x4 = vp8_short_walsh4x4_c;
-
- vp8_variance4x4 = vp8_variance4x4_ppc;
- vp8_variance8x8 = vp8_variance8x8_ppc;
- vp8_variance8x16 = vp8_variance8x16_ppc;
- vp8_variance16x8 = vp8_variance16x8_ppc;
- vp8_variance16x16 = vp8_variance16x16_ppc;
- vp8_mse16x16 = vp8_mse16x16_ppc;
-
- vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_ppc;
- vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_ppc;
- vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_ppc;
- vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ppc;
- vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc;
-
- vp8_get_mb_ss = vp8_get_mb_ss_c;
- vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
-
- vp8_sad16x16 = vp8_sad16x16_ppc;
- vp8_sad16x8 = vp8_sad16x8_ppc;
- vp8_sad8x16 = vp8_sad8x16_ppc;
- vp8_sad8x8 = vp8_sad8x8_ppc;
- vp8_sad4x4 = vp8_sad4x4_ppc;
-
- vp8_block_error = vp8_block_error_ppc;
- vp8_mbblock_error = vp8_mbblock_error_c;
-
- vp8_subtract_b = vp8_subtract_b_c;
- vp8_subtract_mby = vp8_subtract_mby_ppc;
- vp8_subtract_mbuv = vp8_subtract_mbuv_ppc;
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/encodemb_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/encodemb_altivec.asm
deleted file mode 100644
index 6e0099ddc88..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/encodemb_altivec.asm
+++ /dev/null
@@ -1,153 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_subtract_mbuv_ppc
- .globl vp8_subtract_mby_ppc
-
-;# r3 short *diff
-;# r4 unsigned char *usrc
-;# r5 unsigned char *vsrc
-;# r6 unsigned char *pred
-;# r7 int stride
-vp8_subtract_mbuv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf000
- mtspr 256, r12 ;# set VRSAVE
-
- li r9, 256
- add r3, r3, r9
- add r3, r3, r9
- add r6, r6, r9
-
- li r10, 16
- li r9, 4
- mtctr r9
-
- vspltisw v0, 0
-
-mbu_loop:
- lvsl v5, 0, r4 ;# permutate value for alignment
- lvx v1, 0, r4 ;# src
- lvx v2, 0, r6 ;# pred
-
- add r4, r4, r7
- addi r6, r6, 16
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- lvsl v5, 0, r4 ;# permutate value for alignment
- lvx v1, 0, r4 ;# src
-
- add r4, r4, r7
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrglb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mbu_loop
-
- mtctr r9
-
-mbv_loop:
- lvsl v5, 0, r5 ;# permutate value for alignment
- lvx v1, 0, r5 ;# src
- lvx v2, 0, r6 ;# pred
-
- add r5, r5, r7
- addi r6, r6, 16
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- lvsl v5, 0, r5 ;# permutate value for alignment
- lvx v1, 0, r5 ;# src
-
- add r5, r5, r7
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrglb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mbv_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# r3 short *diff
-;# r4 unsigned char *src
-;# r5 unsigned char *pred
-;# r6 int stride
-vp8_subtract_mby_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf800
- mtspr 256, r12 ;# set VRSAVE
-
- li r10, 16
- mtctr r10
-
- vspltisw v0, 0
-
-mby_loop:
- lvx v1, 0, r4 ;# src
- lvx v2, 0, r5 ;# pred
-
- add r4, r4, r6
- addi r5, r5, 16
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vmrglb v3, v0, v1 ;# unpack low src to short
- vmrglb v4, v0, v2 ;# unpack low pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mby_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/fdct_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/fdct_altivec.asm
deleted file mode 100644
index 935d0cb0977..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/fdct_altivec.asm
+++ /dev/null
@@ -1,205 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_short_fdct4x4_ppc
- .globl vp8_short_fdct8x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-;# Forward and inverse DCTs are nearly identical; only differences are
-;# in normalization (fwd is twice unitary, inv is half unitary)
-;# and that they are of course transposes of each other.
-;#
-;# The following three accomplish most of the implementation and
-;# are used only by ppc_idct.c and ppc_fdct.c.
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfffc
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- li r6, 16
-
- load_c v0, dct_tab, 0, r9, r10
- lvx v1, r6, r10
- addi r10, r10, 32
- lvx v2, 0, r10
- lvx v3, r6, r10
-
- load_c v4, ppc_dctperm_tab, 0, r9, r10
- load_c v5, ppc_dctperm_tab, r6, r9, r10
-
- load_c v6, round_tab, 0, r10, r9
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
-;# a/A are the even rows 0,2; b/B are the odd rows 1,3.
-;# For fwd transform, indices are horizontal positions, then frequencies.
-;# For inverse transform, frequencies then positions.
-;# The two resulting A0..A3 B0..B3 are later combined
-;# and vertically transformed.
-
-.macro two_rows_horiz Dst
- vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
-
- vmsumshm v10, v0, v8, v6
- vmsumshm v10, v1, v9, v10
- vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
-
- vmsumshm v11, v2, v8, v6
- vmsumshm v11, v3, v9, v11
- vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
-
- vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
- vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
-.endm
-
-;# Vertical xf on two rows. DCT values in comments are for inverse transform;
-;# forward transform uses transpose.
-
-.macro two_rows_vert Ceven, Codd
- vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
- vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
- vmsumshm v8, v8, v12, v6
- vmsumshm v8, v9, v13, v8
- vsraw v10, v8, v7
-
- vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
- vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
- vmsumshm v8, v8, v12, v6
- vmsumshm v8, v9, v13, v8
- vsraw v8, v8, v7
-
- vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
-.endm
-
-.macro two_rows_h Dest
- stw r0, 0(r8)
- lwz r0, 4(r3)
- stw r0, 4(r8)
- lwzux r0, r3,r5
- stw r0, 8(r8)
- lwz r0, 4(r3)
- stw r0, 12(r8)
- lvx v8, 0,r8
- two_rows_horiz \Dest
-.endm
-
- .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct4x4_ppc:
-
- prologue
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
-
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- epilogue
-
- blr
-
- .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct8x4_ppc:
- prologue
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
- addi r10, r3, 0
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- ;# Next block
- addi r3, r10, 8
- addi r4, r4, 32
- lvx v6, 0, r9 ;# v6 = Hround
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- epilogue
-
- blr
-
- .data
- .align 4
-ppc_dctperm_tab:
- .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
- .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
-
- .align 4
-dct_tab:
- .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
- .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
-
- .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
- .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
-
- .align 4
-round_tab:
- .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
- .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/rdopt_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/rdopt_altivec.asm
deleted file mode 100644
index ba482300973..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/rdopt_altivec.asm
+++ /dev/null
@@ -1,51 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_block_error_ppc
-
- .align 2
-;# r3 short *Coeff
-;# r4 short *dqcoeff
-vp8_block_error_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf800
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- stw r5, 12(r1) ;# transfer dc to vector register
-
- lvx v0, 0, r3 ;# Coeff
- lvx v1, 0, r4 ;# dqcoeff
-
- li r10, 16
-
- vspltisw v3, 0
-
- vsubshs v0, v0, v1
-
- vmsumshm v2, v0, v0, v3 ;# multiply differences
-
- lvx v0, r10, r3 ;# Coeff
- lvx v1, r10, r4 ;# dqcoeff
-
- vsubshs v0, v0, v1
-
- vmsumshm v1, v0, v0, v2 ;# multiply differences
- vsumsws v1, v1, v3 ;# sum up
-
- stvx v1, 0, r1
- lwz r3, 12(r1) ;# return value
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
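
The routine deleted above computes the rate-distortion block error for one 4x4 block: the sum of squared differences between the original and dequantized coefficients, accumulated across all 16 values with vsubshs/vmsumshm/vsumsws. A scalar sketch of the same computation:

    /* Scalar equivalent of the deleted vp8_block_error_ppc: sum of squared
     * coefficient reconstruction errors over one 4x4 block. */
    static int block_error_c(const short *coeff, const short *dqcoeff)
    {
        int i, error = 0;
        for (i = 0; i < 16; i++)
        {
            int diff = coeff[i] - dqcoeff[i];
            error += diff * diff;
        }
        return error;
    }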
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.c
index 9953bd686f4..c5a7bc67039 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.c
@@ -65,8 +65,8 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
short *dequant_ptr = d->dequant;
short zbin_oq_value = b->zbin_extra;
- vpx_memset(qcoeff_ptr, 0, 32);
- vpx_memset(dqcoeff_ptr, 0, 32);
+ memset(qcoeff_ptr, 0, 32);
+ memset(dqcoeff_ptr, 0, 32);
eob = -1;
@@ -101,7 +101,7 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
*d->eob = (char)(eob + 1);
}
-void vp8_quantize_mby_c(MACROBLOCK *x)
+void vp8_quantize_mby(MACROBLOCK *x)
{
int i;
int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -114,7 +114,7 @@ void vp8_quantize_mby_c(MACROBLOCK *x)
x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
}
-void vp8_quantize_mb_c(MACROBLOCK *x)
+void vp8_quantize_mb(MACROBLOCK *x)
{
int i;
int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -125,7 +125,7 @@ void vp8_quantize_mb_c(MACROBLOCK *x)
}
-void vp8_quantize_mbuv_c(MACROBLOCK *x)
+void vp8_quantize_mbuv(MACROBLOCK *x)
{
int i;
@@ -133,23 +133,6 @@ void vp8_quantize_mbuv_c(MACROBLOCK *x)
x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
}
-/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
- * these two C functions if corresponding optimized routine is not available.
- * NEON optimized version implements currently the fast quantization for pair
- * of blocks. */
-void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-{
- vp8_regular_quantize_b(b1, d1);
- vp8_regular_quantize_b(b2, d2);
-}
-
-void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-{
- vp8_fast_quantize_b_c(b1, d1);
- vp8_fast_quantize_b_c(b2, d2);
-}
-
-
static const int qrounding_factors[129] =
{
48, 48, 48, 48, 48, 48, 48, 48,
@@ -552,6 +535,7 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
MACROBLOCKD *mbd = &cpi->mb.e_mbd;
int update = 0;
int new_delta_q;
+ int new_uv_delta_q;
cm->base_qindex = Q;
/* if any of the delta_q values are changing update flag has to be set */
@@ -559,8 +543,6 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
cm->y1dc_delta_q = 0;
cm->y2ac_delta_q = 0;
- cm->uvdc_delta_q = 0;
- cm->uvac_delta_q = 0;
if (Q < 4)
{
@@ -572,6 +554,21 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
update |= cm->y2dc_delta_q != new_delta_q;
cm->y2dc_delta_q = new_delta_q;
+ new_uv_delta_q = 0;
+ // For screen content, lower the q value for the UV channels. For now,
+ // select a conservative delta: the same delta for dc and ac, decreasing
+ // with lower Q, and set to 0 below some threshold. This may later be
+ // conditioned on the variance/energy in the UV channels.
+ if (cpi->oxcf.screen_content_mode && Q > 40) {
+ new_uv_delta_q = -(int)(0.15 * Q);
+ // Check range: magnitude of delta is 4 bits.
+ if (new_uv_delta_q < -15) {
+ new_uv_delta_q = -15;
+ }
+ }
+ update |= cm->uvdc_delta_q != new_uv_delta_q;
+ cm->uvdc_delta_q = new_uv_delta_q;
+ cm->uvac_delta_q = new_uv_delta_q;
/* Set Segment specific quantizers */
mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
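
The screen-content hunk above derives a chroma delta-q from the base Q: -0.15*Q, active only above Q = 40 and clamped to the 4-bit delta range. A stand-alone sketch of the rule (not a drop-in function; the encoder applies the result to both the uvdc and uvac deltas):

    /* Chroma delta-q rule for screen content, mirroring the hunk above. */
    static int uv_delta_q_for_screen_content(int Q)
    {
        int delta = 0;
        if (Q > 40)                  /* no delta below the threshold */
        {
            delta = -(int)(0.15 * Q);
            if (delta < -15)         /* delta magnitude is 4 bits */
                delta = -15;
        }
        return delta;                /* e.g. Q = 60 -> -9, Q = 120 -> -15 */
    }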
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h
index c739b2627b6..7d36c2b45fe 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h
@@ -18,6 +18,9 @@ extern "C" {
struct VP8_COMP;
struct macroblock;
+extern void vp8_quantize_mb(struct macroblock *x);
+extern void vp8_quantize_mby(struct macroblock *x);
+extern void vp8_quantize_mbuv(struct macroblock *x);
extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
index c51650c3c26..25d7a4998cb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
@@ -296,7 +296,7 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
vp8_default_coef_probs(& cpi->common);
- vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
{
int flag[2] = {1, 1};
vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
@@ -305,9 +305,9 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
/* Make sure we initialize separate contexts for altref, gold, and normal.
* TODO shouldn't need 3 different copies of structure to do this!
*/
- vpx_memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
- vpx_memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
- vpx_memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
+ memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
+ memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
+ memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ;
@@ -708,7 +708,13 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
Adjustment = (cpi->this_frame_target - min_frame_target);
if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1))
- cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
+ {
+ Adjustment = (cpi->current_gf_interval - 1) * Adjustment;
+ // Limit adjustment to 10% of current target.
+ if (Adjustment > (10 * cpi->this_frame_target) / 100)
+ Adjustment = (10 * cpi->this_frame_target) / 100;
+ cpi->this_frame_target += Adjustment;
+ }
else
cpi->this_frame_target -= Adjustment;
}
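
The ratectrl.c change caps the mid-GF-interval boost at 10% of the current frame target instead of applying the full scaled adjustment. The arithmetic, isolated as a sketch:

    /* Capped golden-frame boost, as in the hunk above (integer math). */
    static int boosted_frame_target(int this_frame_target, int adjustment,
                                    int current_gf_interval)
    {
        int boost = (current_gf_interval - 1) * adjustment;
        int cap = (10 * this_frame_target) / 100;  /* 10% of current target */
        if (boost > cap)
            boost = cap;
        return this_frame_target + boost;
    }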
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c
index 2f6f5d07c81..9ccd85eb93f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c
@@ -555,8 +555,8 @@ static int vp8_rdcost_mby(MACROBLOCK *mb)
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
- vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -650,8 +650,8 @@ static int rd_pick_intra4x4block(
* a temp buffer that meets the stride requirements, but we are only
* interested in the left 4x4 block
* */
- DECLARE_ALIGNED_ARRAY(16, unsigned char, best_predictor, 16*4);
- DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
+ DECLARE_ALIGNED(16, unsigned char, best_predictor[16*4]);
+ DECLARE_ALIGNED(16, short, best_dqcoeff[16]);
int dst_stride = x->e_mbd.dst.y_stride;
unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
@@ -691,7 +691,7 @@ static int rd_pick_intra4x4block(
*a = tempa;
*l = templ;
copy_predictor(best_predictor, b->predictor);
- vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+ memcpy(best_dqcoeff, b->dqcoeff, 32);
}
}
b->bmi.as_mode = *best_mode;
@@ -715,8 +715,8 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate,
ENTROPY_CONTEXT *tl;
const int *bmode_costs;
- vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -820,8 +820,8 @@ static int rd_cost_mbuv(MACROBLOCK *mb)
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
- vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -837,6 +837,9 @@ static int rd_cost_mbuv(MACROBLOCK *mb)
static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
int *distortion, int fullpixel)
{
+ (void)cpi;
+ (void)fullpixel;
+
vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
vp8_subtract_mbuv(x->src_diff,
x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
@@ -854,6 +857,9 @@ static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
int *distortion, int fullpixel)
{
+ (void)cpi;
+ (void)fullpixel;
+
vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
vp8_subtract_mbuv(x->src_diff,
x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
@@ -1122,8 +1128,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
ENTROPY_CONTEXT *ta_b;
ENTROPY_CONTEXT *tl_b;
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -1166,8 +1172,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
ENTROPY_CONTEXT *ta_s;
ENTROPY_CONTEXT *tl_s;
- vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
ta_s = (ENTROPY_CONTEXT *)&t_above_s;
tl_s = (ENTROPY_CONTEXT *)&t_left_s;
@@ -1323,14 +1329,14 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
mode_selected = this_mode;
best_label_rd = this_rd;
- vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
}
} /*for each 4x4 mode*/
- vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
bsi->ref_mv, x->mvcost);
@@ -1386,7 +1392,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
int i;
BEST_SEG_INFO bsi;
- vpx_memset(&bsi, 0, sizeof(bsi));
+ memset(&bsi, 0, sizeof(bsi));
bsi.segment_rd = best_rd;
bsi.ref_mv = best_ref_mv;
@@ -1655,7 +1661,6 @@ void vp8_mv_pred
mv.as_mv.row = mvx[vcnt/2];
mv.as_mv.col = mvy[vcnt/2];
- find = 1;
/* sr is set to 0 to allow calling function to decide the search
* range.
*/
@@ -1685,16 +1690,16 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
}else if(xd->mb_to_top_edge==0)
{ /* only has left MB for sad calculation. */
near_sad[0] = near_sad[2] = INT_MAX;
- near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
+ near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride);
}else if(xd->mb_to_left_edge ==0)
{ /* only has above MB for sad calculation. */
near_sad[1] = near_sad[2] = INT_MAX;
- near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
+ near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride);
}else
{
- near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
- near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
- near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, UINT_MAX);
+ near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride);
+ near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride);
+ near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride);
}
if(cpi->common.last_frame_type != KEY_FRAME)
@@ -1709,14 +1714,14 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
if(near_sad[4] != INT_MAX)
- near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, UINT_MAX);
+ near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride);
if(near_sad[5] != INT_MAX)
- near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride, UINT_MAX);
- near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride, UINT_MAX);
+ near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride);
+ near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride);
if(near_sad[6] != INT_MAX)
- near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride, UINT_MAX);
+ near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride);
if(near_sad[7] != INT_MAX)
- near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, UINT_MAX);
+ near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride);
}
if(cpi->common.last_frame_type != KEY_FRAME)
@@ -1920,8 +1925,8 @@ static void update_best_mode(BEST_MODE* best_mode, int this_rd,
(rd->distortion2-rd->distortion_uv));
best_mode->rd = this_rd;
- vpx_memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
- vpx_memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
+ memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+ memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
if ((this_mode == B_PRED) || (this_mode == SPLITMV))
{
@@ -1983,9 +1988,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
best_mode.rd = INT_MAX;
best_mode.yrd = INT_MAX;
best_mode.intra_rd = INT_MAX;
- vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
- vpx_memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
- vpx_memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
+ memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+ memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
+ memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
/* Setup search priorities */
get_reference_search_order(cpi, ref_frame_map);
@@ -2287,7 +2292,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
/* Further step/diamond searches as necessary */
- n = 0;
further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
n = num00;
@@ -2554,8 +2558,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
intra_rd_penalty, cpi, x);
if (this_rd < best_mode.rd || x->skip)
{
- /* Note index of best mode so far */
- best_mode_index = mode_index;
*returnrate = rd.rate2;
*returndistortion = rd.distortion2;
update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
@@ -2580,7 +2582,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
/* macroblock modes */
- vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
+ memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
if (best_mode.mbmode.mode == B_PRED)
{
@@ -2593,7 +2595,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
for (i = 0; i < 16; i++)
xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int;
- vpx_memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
+ memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
x->e_mbd.mode_info_context->mbmi.mv.as_int =
x->partition_info->bmi[15].mv.as_int;
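
The sdf calls above lose their trailing UINT_MAX argument: the SAD function pointer no longer takes a max_sad early-out parameter. For reference, a plain C 16x16 SAD with the new four-argument shape (the real kernels are the optimized vpx_dsp versions):

    #include <stdlib.h>

    /* Straightforward 16x16 sum of absolute differences, matching the
     * four-argument signature the fn_ptr[].sdf hook now uses. */
    static unsigned int sad16x16(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride)
    {
        unsigned int sad = 0;
        int r, c;
        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
                sad += abs(src[c] - ref[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }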
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.c
index 37972e219a0..fdd22fceb6e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.c
@@ -23,7 +23,7 @@ void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x)
if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame))
{
/* Reset Gf usage monitors */
- vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
}
else
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c
index 4dc0d959221..ba8b0097710 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c
@@ -163,6 +163,8 @@ static int vp8_temporal_filter_find_matching_mb_c
int pre = d->offset;
int pre_stride = x->e_mbd.pre.y_stride;
+ (void)error_thresh;
+
best_ref_mv1.as_int = 0;
best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >>3;
best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >>3;
@@ -236,12 +238,12 @@ static void vp8_temporal_filter_iterate_c
int mb_rows = cpi->common.mb_rows;
int mb_y_offset = 0;
int mb_uv_offset = 0;
- DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
- DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
+ DECLARE_ALIGNED(16, unsigned int, accumulator[16*16 + 8*8 + 8*8]);
+ DECLARE_ALIGNED(16, unsigned short, count[16*16 + 8*8 + 8*8]);
MACROBLOCKD *mbd = &cpi->mb.e_mbd;
YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
unsigned char *dst1, *dst2;
- DECLARE_ALIGNED_ARRAY(16, unsigned char, predictor, 16*16 + 8*8 + 8*8);
+ DECLARE_ALIGNED(16, unsigned char, predictor[16*16 + 8*8 + 8*8]);
/* Save input state */
unsigned char *y_buffer = mbd->pre.y_buffer;
@@ -272,8 +274,8 @@ static void vp8_temporal_filter_iterate_c
int i, j, k;
int stride;
- vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
- vpx_memset(count, 0, 384*sizeof(unsigned short));
+ memset(accumulator, 0, 384*sizeof(unsigned int));
+ memset(count, 0, 384*sizeof(unsigned short));
#if ALT_REF_MC_ENABLED
cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5));
@@ -500,7 +502,7 @@ void vp8_temporal_filter_prepare_c
start_frame = distance + frames_to_blur_forward;
/* Setup frame pointers, NULL indicates frame not included in filter */
- vpx_memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
+ memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
for (frame = 0; frame < frames_to_blur; frame++)
{
int which_buffer = start_frame - frame;
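
DECLARE_ALIGNED_ARRAY is replaced throughout by DECLARE_ALIGNED, which takes the array declarator directly. Assuming the GCC branch of vpx_ports/mem.h, the new form expands roughly as below; the old macro instead over-allocated a backing buffer and computed an aligned pointer into it by hand.

    /* Rough GCC expansion of
     *   DECLARE_ALIGNED(16, unsigned char, predictor[16*16 + 8*8 + 8*8]); */
    unsigned char predictor[16 * 16 + 8 * 8 + 8 * 8] __attribute__((aligned(16)));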
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c
index 2dc8205278b..afd46fb2197 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c
@@ -421,7 +421,7 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
void init_context_counters(void)
{
- vpx_memset(context_counters, 0, sizeof(context_counters));
+ memset(context_counters, 0, sizeof(context_counters));
}
void print_context_counters()
@@ -596,13 +596,13 @@ void vp8_fix_contexts(MACROBLOCKD *x)
/* Clear entropy contexts for Y2 blocks */
if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
{
- vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
}
else
{
- vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
- vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
}
}
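
The sizeof(ENTROPY_CONTEXT_PLANES)-1 resets rely on the Y2 context being the final byte of the struct, so B_PRED/SPLITMV macroblocks (which code no Y2 block) keep it intact while the Y, U and V contexts are cleared. The layout, paraphrased from vp8/common/entropy.h:

    typedef char ENTROPY_CONTEXT;
    typedef struct
    {
        ENTROPY_CONTEXT y1[4];
        ENTROPY_CONTEXT u[2];
        ENTROPY_CONTEXT v[2];
        ENTROPY_CONTEXT y2[1];   /* last byte: spared by the sizeof-1 memset */
    } ENTROPY_CONTEXT_PLANES;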
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/vp8_asm_enc_offsets.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/vp8_asm_enc_offsets.c
deleted file mode 100644
index a4169b32f6a..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/vp8_asm_enc_offsets.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_config.h"
-#include "block.h"
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-#include "treewriter.h"
-#include "tokenize.h"
-
-BEGIN
-
-/* regular quantize */
-DEFINE(vp8_block_coeff, offsetof(BLOCK, coeff));
-DEFINE(vp8_block_zbin, offsetof(BLOCK, zbin));
-DEFINE(vp8_block_round, offsetof(BLOCK, round));
-DEFINE(vp8_block_quant, offsetof(BLOCK, quant));
-DEFINE(vp8_block_quant_fast, offsetof(BLOCK, quant_fast));
-DEFINE(vp8_block_zbin_extra, offsetof(BLOCK, zbin_extra));
-DEFINE(vp8_block_zrun_zbin_boost, offsetof(BLOCK, zrun_zbin_boost));
-DEFINE(vp8_block_quant_shift, offsetof(BLOCK, quant_shift));
-
-DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
-DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant));
-DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
-DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob));
-
-/* subtract */
-DEFINE(vp8_block_base_src, offsetof(BLOCK, base_src));
-DEFINE(vp8_block_src, offsetof(BLOCK, src));
-DEFINE(vp8_block_src_diff, offsetof(BLOCK, src_diff));
-DEFINE(vp8_block_src_stride, offsetof(BLOCK, src_stride));
-
-DEFINE(vp8_blockd_predictor, offsetof(BLOCKD, predictor));
-
-/* pack tokens */
-DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue));
-DEFINE(vp8_writer_range, offsetof(vp8_writer, range));
-DEFINE(vp8_writer_count, offsetof(vp8_writer, count));
-DEFINE(vp8_writer_pos, offsetof(vp8_writer, pos));
-DEFINE(vp8_writer_buffer, offsetof(vp8_writer, buffer));
-DEFINE(vp8_writer_buffer_end, offsetof(vp8_writer, buffer_end));
-DEFINE(vp8_writer_error, offsetof(vp8_writer, error));
-
-DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token));
-DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra));
-DEFINE(tokenextra_context_tree, offsetof(TOKENEXTRA, context_tree));
-DEFINE(tokenextra_skip_eob_node, offsetof(TOKENEXTRA, skip_eob_node));
-DEFINE(TOKENEXTRA_SZ, sizeof(TOKENEXTRA));
-
-DEFINE(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct));
-
-DEFINE(vp8_token_value, offsetof(vp8_token, value));
-DEFINE(vp8_token_len, offsetof(vp8_token, Len));
-
-DEFINE(vp8_extra_bit_struct_tree, offsetof(vp8_extra_bit_struct, tree));
-DEFINE(vp8_extra_bit_struct_prob, offsetof(vp8_extra_bit_struct, prob));
-DEFINE(vp8_extra_bit_struct_len, offsetof(vp8_extra_bit_struct, Len));
-DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, base_val));
-
-DEFINE(vp8_comp_tplist, offsetof(VP8_COMP, tplist));
-DEFINE(vp8_comp_common, offsetof(VP8_COMP, common));
-DEFINE(vp8_comp_bc , offsetof(VP8_COMP, bc));
-DEFINE(vp8_writer_sz , sizeof(vp8_writer));
-
-DEFINE(tokenlist_start, offsetof(TOKENLIST, start));
-DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop));
-DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
-
-DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows));
-
-END
-
-/* add asserts for any offset that is not supported by assembly code
- * add asserts for any size that is not supported by assembly code
-
- * These are used in vp8cx_pack_tokens. They are hard coded so if their sizes
- * change they will have to be adjusted.
- */
-
-#if HAVE_EDSP
-ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
-ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
-#endif
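
The deleted offsets file fed the obj_int_extract step: each DEFINE baked an offsetof() result into a compiled object so hand-written assembly could address struct members by constant. With the EDSP pack-tokens assembly gone, the machinery is no longer needed. A runnable demo of the underlying idea (names are illustrative, not the vpx_ports macros):

    #include <stddef.h>
    #include <stdio.h>

    /* Assembly cannot evaluate offsetof(), so the build compiled DEFINE()
     * lines and extracted the constants. This just prints what such a step
     * would have emitted for a toy struct. */
    struct demo_block { short *coeff; short *zbin; short *round; };

    int main(void)
    {
        printf("demo_block_coeff EQU %zu\n", offsetof(struct demo_block, coeff));
        printf("demo_block_zbin  EQU %zu\n", offsetof(struct demo_block, zbin));
        return 0;
    }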
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c
index 3a4cf7ee792..101d646ef43 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c
@@ -121,12 +121,12 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
if (abs_sum_diff > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising),
- // checK if we can still apply some (weaker) temporal filtering to
+ // check if we can still apply some (weaker) temporal filtering to
// this block, that would otherwise not be denoised at all. Simplest
// is to apply an additional adjustment to running_avg_y to bring it
// closer to sig. The adjustment is capped by a maximum delta, and
// chosen such that in most cases the resulting sum_diff will be
- // within the accceptable range given by sum_diff_thresh.
+ // within the acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over the
// threshold.
@@ -302,12 +302,12 @@ int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
if (abs_sum_diff > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising),
- // checK if we can still apply some (weaker) temporal filtering to
+ // check if we can still apply some (weaker) temporal filtering to
// this block, that would otherwise not be denoised at all. Simplest
// is to apply an additional adjustment to running_avg_y to bring it
// closer to sig. The adjustment is capped by a maximum delta, and
// chosen such that in most cases the resulting sum_diff will be
- // within the accceptable range given by sum_diff_thresh.
+ // within the acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over the
// threshold.
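
The comments corrected above describe the denoiser's fallback: when the block-level sum of differences is too large to denoise outright, running_avg is nudged toward the signal by a per-pixel delta derived from the excess over the threshold, capped at a maximum. A scalar sketch of that delta (the SSE2 code uses a shift for the division; names and exact rounding here are illustrative):

    /* Per-pixel adjustment for the weaker-filtering fallback. */
    static int fallback_delta(int abs_sum_diff, int sum_diff_thresh,
                              int num_pixels, int max_delta)
    {
        int delta = (abs_sum_diff - sum_diff_thresh + num_pixels - 1)
                    / num_pixels;              /* round the excess up */
        return delta > max_delta ? max_delta : delta;
    }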
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c
index 291d21992fe..b4e92e04b22 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c
@@ -35,10 +35,10 @@
void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
char eob = 0;
- short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *zbin_boost_ptr;
short *qcoeff_ptr = d->qcoeff;
- DECLARE_ALIGNED_ARRAY(16, short, x, 16);
- DECLARE_ALIGNED_ARRAY(16, short, y, 16);
+ DECLARE_ALIGNED(16, short, x[16]);
+ DECLARE_ALIGNED(16, short, y[16]);
__m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
__m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
@@ -55,7 +55,7 @@ void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
__m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
__m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
- vpx_memset(qcoeff_ptr, 0, 32);
+ memset(qcoeff_ptr, 0, 32);
/* Duplicate to all lanes. */
zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk
index 9b11c0da329..b4c814075c7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk
@@ -15,6 +15,7 @@ VP8_COMMON_SRCS-yes += common/onyxd.h
VP8_COMMON_SRCS-yes += common/alloccommon.c
VP8_COMMON_SRCS-yes += common/blockd.c
VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
+VP8_COMMON_SRCS-yes += common/copy_c.c
VP8_COMMON_SRCS-yes += common/debugmodes.c
VP8_COMMON_SRCS-yes += common/default_coef_probs.h
VP8_COMMON_SRCS-yes += common/dequantize.c
@@ -60,7 +61,6 @@ VP8_COMMON_SRCS-yes += common/quant_common.c
VP8_COMMON_SRCS-yes += common/reconinter.c
VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
-VP8_COMMON_SRCS-yes += common/sad_c.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
VP8_COMMON_SRCS-yes += common/variance_c.c
@@ -85,26 +85,23 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/sad_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_mmx.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_impl_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sad_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/sad_sse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/sad_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
@@ -148,7 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_sad16x16_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
@@ -170,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
index b1b079cb260..af9cc7320b9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
@@ -10,7 +10,9 @@
#include "./vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
#include "vpx/vpx_codec.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx_version.h"
@@ -37,6 +39,7 @@ struct vp8_extracfg
vp8e_tuning tuning;
unsigned int cq_level; /* constrained quality level */
unsigned int rc_max_intra_bitrate_pct;
+ unsigned int screen_content_mode;
};
@@ -62,6 +65,7 @@ static struct vp8_extracfg default_extracfg = {
0, /* tuning*/
10, /* cq_level */
0, /* rc_max_intra_bitrate_pct */
+ 0, /* screen_content_mode */
};
struct vpx_codec_alg_priv
@@ -79,6 +83,7 @@ struct vpx_codec_alg_priv
/* pkt_list size depends on the maximum number of lagged frames allowed. */
vpx_codec_pkt_list_decl(64) pkt_list;
unsigned int fixed_kf_cntr;
+ vpx_enc_frame_flags_t control_frame_flags;
};
@@ -194,6 +199,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6);
RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+ RANGE_CHECK_BOOL(vp8_cfg, screen_content_mode);
if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q))
RANGE_CHECK(vp8_cfg, cq_level,
cfg->rc_min_quantizer, cfg->rc_max_quantizer);
@@ -231,7 +237,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, ts_periodicity, 16);
for (i=1; i<cfg->ts_number_layers; i++)
- if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1])
+ if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1] &&
+ cfg->rc_target_bitrate > 0)
ERROR("ts_target_bitrate entries are not strictly increasing");
RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers-1], 1, 1);
@@ -360,9 +367,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
if (oxcf->number_of_layers > 1)
{
memcpy (oxcf->target_bitrate, cfg.ts_target_bitrate,
- sizeof(cfg.ts_target_bitrate));
+ sizeof(cfg.ts_target_bitrate));
memcpy (oxcf->rate_decimator, cfg.ts_rate_decimator,
- sizeof(cfg.ts_rate_decimator));
+ sizeof(cfg.ts_rate_decimator));
memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id));
}
@@ -379,6 +386,8 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den;
oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info;
}
+#else
+ (void)mr_cfg;
#endif
oxcf->cpu_used = vp8_cfg.cpu_used;
@@ -397,6 +406,8 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
oxcf->tuning = vp8_cfg.tuning;
+ oxcf->screen_content_mode = vp8_cfg.screen_content_mode;
+
/*
printf("Current VP8 Settings: \n");
printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
@@ -438,9 +449,14 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
{
vpx_codec_err_t res;
- if (((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
- && (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS))
- ERROR("Cannot change width or height after initialization");
+ if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
+ {
+ if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+ ERROR("Cannot change width or height after initialization");
+ if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+ (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+ ERROR("Cannot increast width or height larger than their initial values");
+ }
/* Prevent increasing lag_in_frames. This check is stricter than it needs
* to be -- the limit is not increasing past the first lag_in_frames
@@ -586,6 +602,15 @@ static vpx_codec_err_t set_rc_max_intra_bitrate_pct(vpx_codec_alg_priv_t *ctx,
return update_extracfg(ctx, &extra_cfg);
}
+static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+ struct vp8_extracfg extra_cfg = ctx->vp8_cfg;
+ extra_cfg.screen_content_mode =
+ CAST(VP8E_SET_SCREEN_CONTENT_MODE, args);
+ return update_extracfg(ctx, &extra_cfg);
+}
+
static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
void **mem_loc)
{
@@ -612,6 +637,9 @@ static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
*mem_loc = (void *)shared_mem_loc;
res = VPX_CODEC_OK;
}
+#else
+ (void)cfg;
+ (void)mem_loc;
#endif
return res;
}
@@ -623,6 +651,8 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
vp8_rtcd();
+ vpx_dsp_rtcd();
+ vpx_scale_rtcd();
if (!ctx->priv)
{
@@ -768,27 +798,9 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
}
}
-
-static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
- const vpx_image_t *img,
- vpx_codec_pts_t pts,
- unsigned long duration,
- vpx_enc_frame_flags_t flags,
- unsigned long deadline)
+static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
+ int flags)
{
- vpx_codec_err_t res = VPX_CODEC_OK;
-
- if (!ctx->cfg.rc_target_bitrate)
- return res;
-
- if (img)
- res = validate_img(ctx, img);
-
- if (!res)
- res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
-
- pick_quickcompress_mode(ctx, duration, deadline);
- vpx_codec_pkt_list_init(&ctx->pkt_list);
/* Handle Flags */
if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
@@ -838,6 +850,39 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
vp8_update_entropy(ctx->cpi, 0);
}
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
+ const vpx_image_t *img,
+ vpx_codec_pts_t pts,
+ unsigned long duration,
+ vpx_enc_frame_flags_t flags,
+ unsigned long deadline)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ if (!ctx->cfg.rc_target_bitrate)
+ return res;
+
+ if (img)
+ res = validate_img(ctx, img);
+
+ if (!res)
+ res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
+
+ pick_quickcompress_mode(ctx, duration, deadline);
+ vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+ // If no flags are set in the encode call, then use the frame flags as
+ // defined via the control function: vp8e_set_frame_flags.
+ if (!flags) {
+ flags = ctx->control_frame_flags;
+ }
+ ctx->control_frame_flags = 0;
+
+ res = set_reference_and_update(ctx, flags);
+
/* Handle fixed keyframe intervals */
if (ctx->cfg.kf_mode == VPX_KF_AUTO
&& ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist)
@@ -1140,6 +1185,25 @@ static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t vp8e_set_frame_flags(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+ int frame_flags = va_arg(args, int);
+ ctx->control_frame_flags = frame_flags;
+ return set_reference_and_update(ctx, frame_flags);
+}
+
+static vpx_codec_err_t vp8e_set_temporal_layer_id(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+ int layer_id = va_arg(args, int);
+ if (layer_id < 0 || layer_id >= (int)ctx->cfg.ts_number_layers) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ ctx->cpi->temporal_layer_id = layer_id;
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
va_list args)
{
@@ -1214,6 +1278,8 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
{VP8E_UPD_ENTROPY, vp8e_update_entropy},
{VP8E_UPD_REFERENCE, vp8e_update_reference},
{VP8E_USE_REFERENCE, vp8e_use_reference},
+ {VP8E_SET_FRAME_FLAGS, vp8e_set_frame_flags},
+ {VP8E_SET_TEMPORAL_LAYER_ID, vp8e_set_temporal_layer_id},
{VP8E_SET_ROI_MAP, vp8e_set_roi_map},
{VP8E_SET_ACTIVEMAP, vp8e_set_activemap},
{VP8E_SET_SCALEMODE, vp8e_set_scalemode},
@@ -1231,6 +1297,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
{VP8E_SET_TUNING, set_tuning},
{VP8E_SET_CQ_LEVEL, set_cq_level},
{VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct},
+ {VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode},
{ -1, NULL},
};
@@ -1264,10 +1331,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
30, /* rc_resize_up_threshold */
VPX_VBR, /* rc_end_usage */
-#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
{0}, /* rc_twopass_stats_in */
{0}, /* rc_firstpass_mb_stats_in */
-#endif
256, /* rc_target_bandwidth */
4, /* rc_min_quantizer */
63, /* rc_max_quantizer */
@@ -1287,9 +1352,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
0, /* kf_min_dist */
128, /* kf_max_dist */
-#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
- "vp8.fpf" /* first pass filename */
-#endif
VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
{0},
{0}, /* ss_target_bitrate */
@@ -1320,12 +1382,13 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) =
NULL, /* vpx_codec_get_si_fn_t get_si; */
NULL, /* vpx_codec_decode_fn_t decode; */
NULL, /* vpx_codec_frame_get_fn_t frame_get; */
+ NULL, /* vpx_codec_set_fb_fn_t set_fb_fn; */
},
{
1, /* 1 cfg map */
- vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
+ vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */
vp8e_encode, /* vpx_codec_encode_fn_t encode; */
- vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
+ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */
vp8e_set_config,
NULL,
vp8e_get_preview,
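
Three encoder controls are added here: VP8E_SET_FRAME_FLAGS, VP8E_SET_TEMPORAL_LAYER_ID and VP8E_SET_SCREEN_CONTENT_MODE. From application code they go through the usual control interface; a sketch (error handling elided, an initialized encoder context assumed):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* Per-frame temporal-layer setup via the new controls. */
    static void configure_frame(vpx_codec_ctx_t *codec, int layer_id,
                                vpx_enc_frame_flags_t flags)
    {
        vpx_codec_control(codec, VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
        vpx_codec_control(codec, VP8E_SET_FRAME_FLAGS, (int)flags);
    }

    /* Once, after vpx_codec_enc_init(), for screen-capture sources. */
    static void enable_screen_mode(vpx_codec_ctx_t *codec)
    {
        vpx_codec_control(codec, VP8E_SET_SCREEN_CONTENT_MODE, 1);
    }

Per the vp8e_encode hunk above, flags set this way are used only when the vpx_codec_encode() call itself passes no flags, and are consumed once.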
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c
index 5aa274dbb0b..72e4770c008 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c
@@ -11,7 +11,9 @@
#include <stdlib.h>
#include <string.h>
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"
#include "vpx/internal/vpx_codec_internal.h"
@@ -60,7 +62,6 @@ struct vpx_codec_alg_priv
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
- int flushed;
int img_setup;
struct frame_buffers yv12_frame_buffers;
void *user_priv;
@@ -75,6 +76,7 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_
* known)
*/
(void)si;
+ (void)flags;
return sizeof(vpx_codec_alg_priv_t);
}
@@ -89,7 +91,6 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
priv->si.sz = sizeof(priv->si);
priv->decrypt_cb = NULL;
priv->decrypt_state = NULL;
- priv->flushed = 0;
if (ctx->config.dec)
{
@@ -107,6 +108,8 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
(void) data;
vp8_rtcd();
+ vpx_dsp_rtcd();
+ vpx_scale_rtcd();
/* This function only allocates space for the vpx_codec_alg_priv_t
* structure. More memory may be required at the time the stream
@@ -189,7 +192,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
/* vet via sync code */
if (clear[3] != 0x9d || clear[4] != 0x01 || clear[5] != 0x2a)
- res = VPX_CODEC_UNSUP_BITSTREAM;
+ return VPX_CODEC_UNSUP_BITSTREAM;
si->w = (clear[6] | (clear[7] << 8)) & 0x3fff;
si->h = (clear[8] | (clear[9] << 8)) & 0x3fff;
@@ -287,8 +290,8 @@ update_fragments(vpx_codec_alg_priv_t *ctx,
if (ctx->fragments.count == 0)
{
/* New frame, reset fragment pointers and sizes */
- vpx_memset((void*)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs));
- vpx_memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes));
+ memset((void*)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs));
+ memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes));
}
if (ctx->fragments.enabled && !(data == NULL && data_sz == 0))
{
@@ -307,6 +310,11 @@ update_fragments(vpx_codec_alg_priv_t *ctx,
return 0;
}
+ if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+ {
+ return 0;
+ }
+
if (!ctx->fragments.enabled)
{
ctx->fragments.ptrs[0] = data;
@@ -327,14 +335,11 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
unsigned int resolution_change = 0;
unsigned int w, h;
- if (data == NULL && data_sz == 0) {
- ctx->flushed = 1;
- return VPX_CODEC_OK;
+ if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+ {
+ return 0;
}
- /* Reset flushed when receiving a valid frame */
- ctx->flushed = 0;
-
/* Update the input fragment data */
if(update_fragments(ctx, data, data_sz, &res) <= 0)
return res;
@@ -401,7 +406,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
if (!res)
{
VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
- if(resolution_change)
+ if (resolution_change)
{
VP8_COMMON *const pc = & pbi->common;
MACROBLOCKD *const xd = & pbi->mb;
@@ -647,6 +652,8 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_INVALID_PARAM;
#else
+ (void)ctx;
+ (void)args;
return VPX_CODEC_INCAPABLE;
#endif
}
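
On the decoder side, removing the flushed flag changes what a data-less call means: for non-fragmented streams it is now simply a successful no-op. A sketch of the caller-visible "flush":

    #include "vpx/vpx_decoder.h"

    /* A NULL/zero-length decode call is now a no-op success for
     * non-fragmented streams rather than a state toggle. */
    static vpx_codec_err_t flush_decoder(vpx_codec_ctx_t *decoder)
    {
        return vpx_codec_decode(decoder, NULL, 0, NULL, 0);
    }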
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk
index a0dbdcfa92f..5e4ef05987e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk
@@ -75,7 +75,6 @@ VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
VP8_CX_SRCS-yes += encoder/temporal_filter.c
VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
-VP8_CX_SRCS-yes += encoder/vp8_asm_enc_offsets.c
ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
@@ -107,6 +106,3 @@ VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
endif
VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
-
-$(eval $(call asm_offsets_template,\
- vp8_asm_enc_offsets.asm, $(VP8_PREFIX)encoder/vp8_asm_enc_offsets.c))
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk
index ed19fd4e1d9..05003017982 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk
@@ -14,30 +14,17 @@ VP8_CX_SRCS-$(ARCH_ARM) += vp8cx_arm.mk
#File list for arm
# encoder
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
-
-#File list for edsp
-# encoder
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/boolhuff_arm.c
-VP8_CX_SRCS_REMOVE-$(HAVE_EDSP) += encoder/boolhuff.c
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
#File list for media
# encoder
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
# encoder
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM)
-
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
new file mode 100644
index 00000000000..dd569d348fb
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vp9_convolve8_avg_horiz_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ uint8_t *s, *d;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ if (x_step_q4 != 16) {
+ vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ src += 7;
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w;
+ width > 0;
+ width -= 4, src += 4, dst += 4) { // loop_horiz
+ s = src;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(src + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(src + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(src + 64 + src_stride * 2);
+
+ d = dst;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(src + 64 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ d = dst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ src += src_stride * 4 - w - 7;
+ dst += dst_stride * 4 - w;
+ }
+ return;
+}
+
+void vp9_convolve8_avg_vert_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ uint8_t *s, *d;
+ uint8x8_t d2u8, d3u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ uint8x16_t q1u8, q3u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ if (y_step_q4 != 16) {
+ vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+ d -= dst_stride * 3;
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
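
The intrinsics above follow the assembly closely: MULTIPLY_BY_Q0 accumulates the eight taps with vmull_lane_s16/vmlal_lane_s16, vqrshrun_n_s32(..., 7) performs the rounding shift by FILTER_BITS with unsigned saturation, vqmovn_u16 narrows to bytes, and vrhaddq_u8 folds in the rounding average with the existing destination pixels. A minimal scalar sketch of the per-pixel computation, using hypothetical helper names that are not part of this tree:

    #include <stdint.h>

    static uint8_t clip_u8(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* One output pixel of the 8-tap averaging convolution: accumulate the
     * taps, round-shift by FILTER_BITS (7), clamp to 8 bits, then take the
     * rounding average with the current dst value -- the scalar twin of
     * vqrshrun_n_s32(..., 7) followed by vrhaddq_u8. */
    static uint8_t convolve8_avg_pixel(const uint8_t *src,  /* 3 left of center */
                                       const int16_t *filter, uint8_t dst) {
      int k, sum = 0;
      for (k = 0; k < 8; ++k)
        sum += src[k] * filter[k];
      return (uint8_t)((dst + clip_u8((sum + 64) >> 7) + 1) >> 1);
    }
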
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm
index 6b20cb9bf2a..4d85846f0a9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm
@@ -78,7 +78,7 @@
mov r10, r6 ; w loop counter
-loop_horiz_v
+vp9_convolve8_avg_loop_horiz_v
vld1.8 {d24}, [r0], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d26}, [r0], r1
@@ -101,7 +101,7 @@ loop_horiz_v
add r0, r0, #3
-loop_horiz
+vp9_convolve8_avg_loop_horiz
add r5, r0, #64
vld1.32 {d28[]}, [r0], r1
@@ -170,14 +170,14 @@ loop_horiz
vmov q9, q13
subs r6, r6, #4 ; w -= 4
- bgt loop_horiz
+ bgt vp9_convolve8_avg_loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r9 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
- bgt loop_horiz_v
+ bgt vp9_convolve8_avg_loop_horiz_v
pop {r4-r10, pc}
@@ -203,7 +203,7 @@ loop_horiz
lsl r1, r1, #1
lsl r3, r3, #1
-loop_vert_h
+vp9_convolve8_avg_loop_vert_h
mov r4, r0
add r7, r0, r1, asr #1
mov r5, r2
@@ -223,7 +223,7 @@ loop_vert_h
vmovl.u8 q10, d20
vmovl.u8 q11, d22
-loop_vert
+vp9_convolve8_avg_loop_vert
; always process a 4x4 block at a time
vld1.u32 {d24[0]}, [r7], r1
vld1.u32 {d26[0]}, [r4], r1
@@ -288,13 +288,13 @@ loop_vert
vmov d22, d25
subs r12, r12, #4 ; h -= 4
- bgt loop_vert
+ bgt vp9_convolve8_avg_loop_vert
; outer loop
add r0, r0, #4
add r2, r2, #4
subs r6, r6, #4 ; w -= 4
- bgt loop_vert_h
+ bgt vp9_convolve8_avg_loop_vert_h
pop {r4-r8, pc}
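
This and the matching hunks below only rename each NEON assembly file with an _asm suffix — making room for the new intrinsics .c file of the same base name — and give the local branch labels function-specific prefixes (loop_horiz becomes vp9_convolve8_avg_loop_horiz, and so on), apparently so the otherwise identical label names in the various convolve sources stay unique when the files are assembled into one library.
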
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.c
new file mode 100644
index 00000000000..5c555c45882
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vp9_convolve8_horiz_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ uint8_t *s, *d, *psrc, *pdst;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ if (x_step_q4 != 16) {
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4,
+ src += src_stride * 4,
+ dst += dst_stride * 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+ __builtin_prefetch(src + src_stride * 6);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w, psrc = src + 7, pdst = dst;
+ width > 0;
+ width -= 4, psrc += 4, pdst += 4) { // loop_horiz
+ s = psrc;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(psrc + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(psrc + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(psrc + 60 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+ d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+ d = pdst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ }
+ return;
+}
+
+void vp9_convolve8_vert_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ uint8_t *s, *d;
+ uint32x2_t d2u32, d3u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ if (y_step_q4 != 16) {
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+ d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
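
Both horizontal passes work on 4x4 tiles: four rows are loaded, transposed in registers with the vtrn family so one set of multiply-accumulates yields four neighbouring outputs, and transposed back before the store. A standalone sketch of such a register transpose for a 4x4 byte tile, built from the same vtrn cascade used above (illustrative helper, not from this tree):

    #include <arm_neon.h>

    /* Transpose a 4x4 block of bytes: on entry *a0 holds rows 0 and 1
     * (4 bytes each), *a1 holds rows 2 and 3. On exit *a0 holds the
     * transposed rows 0 and 2, *a1 the transposed rows 1 and 3. */
    static void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
      /* Swap 16-bit pairs: 00 01 20 21 10 11 30 31 / 02 03 22 23 12 13 32 33 */
      const uint16x4x2_t b =
          vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
      /* Swap 32-bit halves: 00 01 20 21 02 03 22 23 / 10 11 30 31 12 13 32 33 */
      const uint32x2x2_t c = vtrn_u32(vreinterpret_u32_u16(b.val[0]),
                                      vreinterpret_u32_u16(b.val[1]));
      /* Swap bytes: 00 10 20 30 02 12 22 32 / 01 11 21 31 03 13 23 33 */
      const uint8x8x2_t d =
          vtrn_u8(vreinterpret_u8_u32(c.val[0]), vreinterpret_u8_u32(c.val[1]));
      *a0 = d.val[0];
      *a1 = d.val[1];
    }
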
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm
index 45258454cab..184c3ad679c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm
@@ -78,7 +78,7 @@
mov r10, r6 ; w loop counter
-loop_horiz_v
+vp9_convolve8_loop_horiz_v
vld1.8 {d24}, [r0], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d26}, [r0], r1
@@ -101,7 +101,7 @@ loop_horiz_v
add r0, r0, #3
-loop_horiz
+vp9_convolve8_loop_horiz
add r5, r0, #64
vld1.32 {d28[]}, [r0], r1
@@ -159,14 +159,14 @@ loop_horiz
vmov q9, q13
subs r6, r6, #4 ; w -= 4
- bgt loop_horiz
+ bgt vp9_convolve8_loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r9 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
- bgt loop_horiz_v
+ bgt vp9_convolve8_loop_horiz_v
pop {r4-r10, pc}
@@ -192,7 +192,7 @@ loop_horiz
lsl r1, r1, #1
lsl r3, r3, #1
-loop_vert_h
+vp9_convolve8_loop_vert_h
mov r4, r0
add r7, r0, r1, asr #1
mov r5, r2
@@ -212,7 +212,7 @@ loop_vert_h
vmovl.u8 q10, d20
vmovl.u8 q11, d22
-loop_vert
+vp9_convolve8_loop_vert
; always process a 4x4 block at a time
vld1.u32 {d24[0]}, [r7], r1
vld1.u32 {d26[0]}, [r4], r1
@@ -266,13 +266,13 @@ loop_vert
vmov d22, d25
subs r12, r12, #4 ; h -= 4
- bgt loop_vert
+ bgt vp9_convolve8_loop_vert
; outer loop
add r0, r0, #4
add r2, r2, #4
subs r6, r6, #4 ; w -= 4
- bgt loop_vert_h
+ bgt vp9_convolve8_loop_vert_h
pop {r4-r8, pc}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon.c
new file mode 100644
index 00000000000..3a3db353e8b
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_convolve_avg_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8_t *d;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint32x2_t d0u32, d2u32;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ d = dst;
+ if (w > 32) { // avg64
+ for (; h > 0; h -= 1) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ q10u8 = vld1q_u8(d + 32);
+ q11u8 = vld1q_u8(d + 48);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // avg32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+ q10u8 = vld1q_u8(d);
+ q11u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // avg16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+ q2u8 = vld1q_u8(d);
+ d += dst_stride;
+ q3u8 = vld1q_u8(d);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q2u8);
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // avg8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d1u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(d);
+ d += dst_stride;
+ d3u8 = vld1_u8(d);
+ d += dst_stride;
+
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+ vst1_u8(dst, vget_low_u8(q0u8));
+ dst += dst_stride;
+ vst1_u8(dst, vget_high_u8(q0u8));
+ dst += dst_stride;
+ }
+ } else { // avg4
+ for (; h > 0; h -= 2) {
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+ src += src_stride;
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+ src += src_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+
+ d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+ vreinterpret_u8_u32(d2u32));
+
+ d0u32 = vreinterpret_u32_u8(d0u8);
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+ dst += dst_stride;
+ }
+ }
+ return;
+}
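
All five width classes reduce to the same per-byte operation; vrhaddq_u8 and vrhadd_u8 compute the rounding halving add lane-wise. A scalar reference of the whole kernel (hypothetical helper; the filter arguments are omitted because the NEON code ignores them):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar equivalent of vp9_convolve_avg_neon: dst = round((dst + src) / 2). */
    static void convolve_avg_ref(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 int w, int h) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x)
          dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);
        src += src_stride;
        dst += dst_stride;
      }
    }
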
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm
index 7d245302177..7d245302177 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c
index f0881b5ae9c..2e28cb20ebd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -20,7 +20,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
/* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
* maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
*/
- DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+ DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
// Account for the vertical phase needing 3 lines prior and 4 lines post
int intermediate_height = h + 7;
@@ -56,7 +56,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+ DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16) {
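
The only change here tracks the updated vpx_ports/mem.h macro: DECLARE_ALIGNED_ARRAY(align, type, name, size) is replaced by DECLARE_ALIGNED(align, type, name[size]), which takes an ordinary array declarator. The surrounding function is the usual separable two-pass scheme; roughly, assuming the prototypes from the files above and vpx_ports/mem.h (a sketch, not the verbatim body):

    /* Filter horizontally into a 64x72 scratch buffer, h + 7 rows deep so
     * the vertical 8-tap phase has the 3 rows before and 4 rows after it
     * needs, then filter vertically from the scratch buffer into dst. */
    static void convolve8_two_pass_sketch(
        const uint8_t *src, ptrdiff_t src_stride,
        uint8_t *dst, ptrdiff_t dst_stride,
        const int16_t *filter_x, int x_step_q4,
        const int16_t *filter_y, int y_step_q4, int w, int h) {
      DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
      const int intermediate_height = h + 7;
      vp9_convolve8_horiz_neon((uint8_t *)src - src_stride * 3, src_stride,
                               temp, 64, filter_x, x_step_q4,
                               filter_y, y_step_q4, w, intermediate_height);
      vp9_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    }
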
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.c
new file mode 100644
index 00000000000..f334abe1130
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_convolve_copy_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8x8_t d0u8, d2u8;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ if (w > 32) { // copy64
+ for (; h > 0; h--) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // copy32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // copy16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // copy8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(src);
+ src += src_stride;
+
+ vst1_u8(dst, d0u8);
+ dst += dst_stride;
+ vst1_u8(dst, d2u8);
+ dst += dst_stride;
+ }
+ } else { // copy4
+ for (; h > 0; h--) {
+ *(uint32_t *)dst = *(const uint32_t *)src;
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ return;
+}
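
The copy kernel buckets widths like the averaging kernel above: w > 32 handles 64-wide blocks with four q-register loads per row, the 32-, 16- and 8-wide cases unroll two rows per iteration, and only the 4-wide tail falls back to a plain 32-bit scalar copy per row (which assumes the rows are 4-byte aligned).
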
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon_asm.asm
index a0bd04a35f0..a0bd04a35f0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
deleted file mode 100644
index 60a0d98c56c..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp9_dc_only_idct_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr,
-; uint8_t *dst_ptr, int pitch, int stride)
-;
-; r0 int input_dc
-; r1 uint8_t *pred_ptr
-; r2 uint8_t *dst_ptr
-; r3 int pitch
-; sp int stride
-
-|vp9_dc_only_idct_add_neon| PROC
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- mul r0, r0, r12 ; input_dc * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; ROUND_POWER_OF_TWO(out, 4)
- add r0, r0, #8 ; + (1 <<((4) - 1))
- asr r0, r0, #4 ; >> 4
-
- vdup.16 q0, r0; ; duplicate a1
- ldr r12, [sp] ; load stride
-
- vld1.32 {d2[0]}, [r1], r3
- vld1.32 {d2[1]}, [r1], r3
- vld1.32 {d4[0]}, [r1], r3
- vld1.32 {d4[1]}, [r1]
-
- vaddw.u8 q1, q0, d2 ; a1 + pred_ptr[c]
- vaddw.u8 q2, q0, d4
-
- vqmovun.s16 d2, q1 ; clip_pixel
- vqmovun.s16 d4, q2
-
- vst1.32 {d2[0]}, [r2], r12
- vst1.32 {d2[1]}, [r2], r12
- vst1.32 {d4[0]}, [r2], r12
- vst1.32 {d4[1]}, [r2]
-
- bx lr
- ENDP ; |vp9_dc_only_idct_add_neon|
-
- END
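
The removed file implemented the 4x4 DC-only reconstruction case directly in assembly; its arithmetic, spelled out by the comments above, is the scalar sequence below (a sketch whose helper names mirror vp9/common/vp9_idct.h, not code from this tree):

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    static const int16_t kCospi16_64 = 11585;  /* round(cos(pi/4) * 2^14) */

    static int dct_const_round_shift_ref(int input) {
      return (input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
    }

    static void dc_only_idct_add_ref(int input_dc, const uint8_t *pred,
                                     uint8_t *dst, int pitch, int stride) {
      int r, c;
      int out = dct_const_round_shift_ref(input_dc * kCospi16_64);
      int a1;
      out = dct_const_round_shift_ref(out * kCospi16_64);
      a1 = (out + 8) >> 4;  /* ROUND_POWER_OF_TWO(out, 4) */
      for (r = 0; r < 4; ++r) {
        for (c = 0; c < 4; ++c) {
          const int v = pred[c] + a1;
          dst[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);  /* clip_pixel */
        }
        pred += pitch;
        dst += stride;
      }
    }
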
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c
new file mode 100644
index 00000000000..3c8c6a9348d
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
+void vp9_idct16x16_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, j, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ for (d1 = d2 = dest, i = 0; i < 4; i++) {
+ for (j = 0; j < 2; j++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ }
+ return;
+}
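
Since cospi_16_64 = 11585 ≈ 2^14 · cos(π/4) and cos²(π/4) = 1/2, the two dct_const_round_shift calls scale the DC coefficient by roughly one half, and ROUND_POWER_OF_TWO(out, 6) supplies the remaining 1/64 normalisation of the 16x16 inverse transform (versus the shift of 4 in the removed 4x4 path above). For example, input[0] = 1024 gives out = 724, then 512, then a1 = 8, and the loop body adds 8 to all 256 destination pixels via vaddw_u8 plus the saturating narrow vqmovun_s16.
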
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm
index b1fd21bb61f..b1fd21bb61f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
new file mode 100644
index 00000000000..5fa3f5c017b
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
@@ -0,0 +1,1332 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
+
+static INLINE void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+void vp9_idct16x16_256_add_neon_pass1(
+ int16_t *in,
+ int16_t *out,
+ int output_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d18s16, d1s16);
+ q6s32 = vmull_s16(d19s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
+ q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
+
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q5s32, 14);
+ d15s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ q2s32 = vmull_s16(d26s16, d2s16);
+ q3s32 = vmull_s16(d27s16, d2s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q15s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
+
+ d10s16 = vqrshrn_n_s32(q2s32, 14);
+ d11s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q15s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ d30s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d30s16);
+ q11s32 = vmull_s16(d17s16, d30s16);
+ q0s32 = vmull_s16(d24s16, d30s16);
+ q1s32 = vmull_s16(d25s16, d30s16);
+
+ d30s16 = vdup_n_s16(cospi_24_64);
+ d31s16 = vdup_n_s16(cospi_8_64);
+
+ q3s32 = vaddq_s32(q2s32, q0s32);
+ q12s32 = vaddq_s32(q11s32, q1s32);
+ q13s32 = vsubq_s32(q2s32, q0s32);
+ q1s32 = vsubq_s32(q11s32, q1s32);
+
+ d16s16 = vqrshrn_n_s32(q3s32, 14);
+ d17s16 = vqrshrn_n_s32(q12s32, 14);
+ d18s16 = vqrshrn_n_s32(q13s32, 14);
+ d19s16 = vqrshrn_n_s32(q1s32, 14);
+ q8s16 = vcombine_s16(d16s16, d17s16);
+ q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q0s32 = vmull_s16(d20s16, d31s16);
+ q1s32 = vmull_s16(d21s16, d31s16);
+ q12s32 = vmull_s16(d20s16, d30s16);
+ q13s32 = vmull_s16(d21s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
+ q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
+
+ d22s16 = vqrshrn_n_s32(q0s32, 14);
+ d23s16 = vqrshrn_n_s32(q1s32, 14);
+ d20s16 = vqrshrn_n_s32(q12s32, 14);
+ d21s16 = vqrshrn_n_s32(q13s32, 14);
+ q10s16 = vcombine_s16(d20s16, d21s16);
+ q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q15s16 = vaddq_s16(q6s16, q7s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ // stage 5
+ q0s16 = vaddq_s16(q8s16, q11s16);
+ q1s16 = vaddq_s16(q9s16, q10s16);
+ q2s16 = vsubq_s16(q9s16, q10s16);
+ q3s16 = vsubq_s16(q8s16, q11s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q11s32 = vmull_s16(d26s16, d16s16);
+ q12s32 = vmull_s16(d27s16, d16s16);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q13s32 = vsubq_s32(q10s32, q12s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d11s16 = vqrshrn_n_s32(q13s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q8s16 = vaddq_s16(q0s16, q15s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d16u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void vp9_idct16x16_256_add_neon_pass2(
+ int16_t *src,
+ int16_t *out,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d;
+ uint8x8_t d12u8, d13u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64;
+ int64x1_t d12s64, d13s64;
+ uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
+ uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d12s16 = vdup_n_s16(cospi_30_64);
+ d13s16 = vdup_n_s16(cospi_2_64);
+
+ q2s32 = vmull_s16(d16s16, d12s16);
+ q3s32 = vmull_s16(d17s16, d12s16);
+ q1s32 = vmull_s16(d16s16, d13s16);
+ q4s32 = vmull_s16(d17s16, d13s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
+ q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
+ q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
+
+ d0s16 = vqrshrn_n_s32(q2s32, 14);
+ d1s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q1s32, 14);
+ d15s16 = vqrshrn_n_s32(q4s32, 14);
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d30s16 = vdup_n_s16(cospi_14_64);
+ d31s16 = vdup_n_s16(cospi_18_64);
+
+ q2s32 = vmull_s16(d24s16, d30s16);
+ q3s32 = vmull_s16(d25s16, d30s16);
+ q4s32 = vmull_s16(d24s16, d31s16);
+ q5s32 = vmull_s16(d25s16, d31s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q2s32, 14);
+ d3s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q4s32, 14);
+ d13s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(cospi_22_64);
+ d31s16 = vdup_n_s16(cospi_10_64);
+
+ q11s32 = vmull_s16(d20s16, d30s16);
+ q12s32 = vmull_s16(d21s16, d30s16);
+ q4s32 = vmull_s16(d20s16, d31s16);
+ q5s32 = vmull_s16(d21s16, d31s16);
+
+ q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d11s16 = vqrshrn_n_s32(q5s32, 14);
+ d10s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ d30s16 = vdup_n_s16(cospi_6_64);
+ d31s16 = vdup_n_s16(cospi_26_64);
+
+ q10s32 = vmull_s16(d28s16, d30s16);
+ q11s32 = vmull_s16(d29s16, d30s16);
+ q12s32 = vmull_s16(d28s16, d31s16);
+ q13s32 = vmull_s16(d29s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
+ q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
+ q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
+ q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q11s32, 14);
+ d8s16 = vqrshrn_n_s32(q12s32, 14);
+ d9s16 = vqrshrn_n_s32(q13s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 3
+ q9s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q10s16 = vsubq_s16(q3s16, q2s16);
+ q11s16 = vaddq_s16(q2s16, q3s16);
+ q12s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q6s16, q7s16);
+
+ // stage 4
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ q2s32 = vmull_s16(d18s16, d31s16);
+ q3s32 = vmull_s16(d19s16, d31s16);
+ q4s32 = vmull_s16(d28s16, d31s16);
+ q5s32 = vmull_s16(d29s16, d31s16);
+
+ q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
+ q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
+
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q3s32, 14);
+ d2s16 = vqrshrn_n_s32(q4s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q3s16 = q11s16;
+ q4s16 = q12s16;
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q11s32 = vmull_s16(d26s16, d30s16);
+ q12s32 = vmull_s16(d27s16, d30s16);
+ q8s32 = vmull_s16(d20s16, d30s16);
+ q9s32 = vmull_s16(d21s16, d30s16);
+
+ q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16(cospi_16_64);
+
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q10s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q10s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
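+ // When skip_adding != 0 this is the final (column) pass: combine with
+ // the rows saved in pass 1, round by 6 bits and accumulate into dest.
+ // Otherwise the result is written to the intermediate out buffer.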
+ if (skip_adding != 0) {
+ d = dest;
+ // load the data produced in pass 1
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ // add and store output rows 8,9,10,11,12,13,14,15
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q2s16 = vrshrq_n_s16(q2s16, 6);
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q3s16 = vrshrq_n_s16(q3s16, 6);
+ q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+ q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q14s16 = vrshrq_n_s16(q14s16, 6);
+ q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ q15s16 = vrshrq_n_s16(q15s16, 6);
+ q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ } else { // skip_adding_dest
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
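+ // two 4-wide stores lay down 8 contiguous int16s; the trailing
+ // "out += 12" then jumps to the same position in the next 16-int16 row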
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+ }
+ return;
+}
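+
+// Throughout these passes the cospi products are kept in Q14:
+// vqrshrn_n_s32(x, 14) narrows with rounding ((x + (1 << 13)) >> 14)
+// and saturation back to int16_t.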
+
+void vp9_idct16x16_10_add_neon_pass1(
+ int16_t *in,
+ int16_t *out,
+ int output_stride) {
+ int16x4_t d4s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q15s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
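+
+ // vld2q_s16 de-interleaves adjacent pairs, so val[0] above collects
+ // elements 0,2,4,...,14 of each 16-wide row -- the even columns this
+ // pass operates on.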
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // stage 3
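+ // vqrdmulhq_s16 returns (a * b * 2 + (1 << 15)) >> 16, so multiplying
+ // by a doubled cospi constant reproduces the Q14 rounding multiply
+ // ((a * c + (1 << 13)) >> 14) done elsewhere with vmull/vqrshrn.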
+ q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ // stage 4
+ q1s16 = vdupq_n_s16(cospi_16_64 * 2);
+ d4s16 = vdup_n_s16(cospi_16_64);
+
+ q8s16 = vqrdmulhq_s16(q8s16, q1s16);
+
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+ q9s32 = vmull_s16(d14s16, d4s16);
+ q10s32 = vmull_s16(d15s16, d4s16);
+ q12s32 = vmull_s16(d9s16, d4s16);
+ q11s32 = vmull_s16(d8s16, d4s16);
+
+ q15s32 = vsubq_s32(q10s32, q12s32);
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d11s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q2s16 = vaddq_s16(q8s16, q7s16);
+ q9s16 = vaddq_s16(q8s16, q6s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q8s16, q4s16);
+ q12s16 = vsubq_s16(q8s16, q4s16);
+ q13s16 = vsubq_s16(q8s16, q5s16);
+ q14s16 = vsubq_s16(q8s16, q6s16);
+ q15s16 = vsubq_s16(q8s16, q7s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d4u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void vp9_idct16x16_10_add_neon_pass2(
+ int16_t *src,
+ int16_t *out,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
+ uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+ (void)skip_adding;
+ (void)dest;
+ (void)dest_stride;
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // stage 3
+ q6s16 = vdupq_n_s16(cospi_30_64 * 2);
+ q0s16 = vqrdmulhq_s16(q8s16, q6s16);
+ q6s16 = vdupq_n_s16(cospi_2_64 * 2);
+ q7s16 = vqrdmulhq_s16(q8s16, q6s16);
+
+ q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
+ q14s16 = vdupq_n_s16(cospi_6_64 * 2);
+ q3s16 = vqrdmulhq_s16(q9s16, q15s16);
+ q4s16 = vqrdmulhq_s16(q9s16, q14s16);
+
+ // stage 4
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+ d6s16 = vget_low_s16(q3s16);
+ d7s16 = vget_high_s16(q3s16);
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ q12s32 = vmull_s16(d14s16, d31s16);
+ q5s32 = vmull_s16(d15s16, d31s16);
+ q2s32 = vmull_s16(d0s16, d31s16);
+ q11s32 = vmull_s16(d1s16, d31s16);
+
+ q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
+ q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q12s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q11s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q10s32 = vmull_s16(d8s16, d30s16);
+ q13s32 = vmull_s16(d9s16, d30s16);
+ q8s32 = vmull_s16(d6s16, d30s16);
+ q9s32 = vmull_s16(d7s16, d30s16);
+
+ q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q10s32, 14);
+ d5s16 = vqrshrn_n_s32(q13s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16(cospi_16_64);
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q0s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q0s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
+ d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
+ d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
+ d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
+ d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
+ d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ vst1_u64((uint64_t *)out, d16u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d4u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d6u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d7u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d8u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d9u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d10u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d11u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
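
For reference, a minimal scalar sketch of the pattern the intrinsics above
repeat -- widen with vmull_s16, accumulate with vmlal_s16/vmlsl_s16, then
narrow with vqrshrn_n_s32(x, 14). The helper name and the standalone
DCT_CONST_BITS define are illustrative, not part of the patch:

#include <stdint.h>

#define DCT_CONST_BITS 14  /* the cospi_*_64 constants are in Q14 */

/* One half of a butterfly, per lane:
 * round((a * c1 - b * c2) / 2^14), saturated to int16_t. */
static int16_t butterfly_sub(int16_t a, int16_t b, int16_t c1, int16_t c2) {
  int32_t x = (int32_t)a * c1 - (int32_t)b * c2;
  x = (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
  if (x > INT16_MAX) x = INT16_MAX;  /* vqrshrn_n_s32 also saturates */
  if (x < INT16_MIN) x = INT16_MIN;
  return (int16_t)x;
}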
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm
index a13c0d04b83..a13c0d04b83 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
index 0b9fc09abd7..f2c4ec4518c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -30,18 +30,24 @@ void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
uint8_t *dest,
int dest_stride);
+#if HAVE_NEON_ASM
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
extern void vp9_push_neon(int64_t *store);
extern void vp9_pop_neon(int64_t *store);
+#endif // HAVE_NEON_ASM
void vp9_idct16x16_256_add_neon(const int16_t *input,
uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
int64_t store_reg[8];
+#endif
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
+#if HAVE_NEON_ASM
// save d8-d15 register values.
vp9_push_neon(store_reg);
+#endif
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
@@ -103,20 +109,26 @@ void vp9_idct16x16_256_add_neon(const int16_t *input,
dest+8,
dest_stride);
+#if HAVE_NEON_ASM
// restore d8-d15 register values.
vp9_pop_neon(store_reg);
+#endif
return;
}
void vp9_idct16x16_10_add_neon(const int16_t *input,
uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
int64_t store_reg[8];
+#endif
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
+#if HAVE_NEON_ASM
// save d8-d15 register values.
vp9_push_neon(store_reg);
+#endif
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
@@ -165,8 +177,10 @@ void vp9_idct16x16_10_add_neon(const int16_t *input,
dest+8,
dest_stride);
+#if HAVE_NEON_ASM
// restore d8-d15 register values.
vp9_pop_neon(store_reg);
+#endif
return;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
new file mode 100644
index 00000000000..d0e4b4f4014
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
+#include "./vpx_config.h"
+
+static INLINE void LD_16x8(
+ uint8_t *d,
+ int d_stride,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vld1q_u8(d);
+ d += d_stride;
+ *q9u8 = vld1q_u8(d);
+ d += d_stride;
+ *q10u8 = vld1q_u8(d);
+ d += d_stride;
+ *q11u8 = vld1q_u8(d);
+ d += d_stride;
+ *q12u8 = vld1q_u8(d);
+ d += d_stride;
+ *q13u8 = vld1q_u8(d);
+ d += d_stride;
+ *q14u8 = vld1q_u8(d);
+ d += d_stride;
+ *q15u8 = vld1q_u8(d);
+ return;
+}
+
+static INLINE void ADD_DIFF_16x8(
+ uint8x16_t qdiffu8,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+ return;
+}
+
+static INLINE void SUB_DIFF_16x8(
+ uint8x16_t qdiffu8,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+ return;
+}
+
+static INLINE void ST_16x8(
+ uint8_t *d,
+ int d_stride,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ vst1q_u8(d, *q8u8);
+ d += d_stride;
+ vst1q_u8(d, *q9u8);
+ d += d_stride;
+ vst1q_u8(d, *q10u8);
+ d += d_stride;
+ vst1q_u8(d, *q11u8);
+ d += d_stride;
+ vst1q_u8(d, *q12u8);
+ d += d_stride;
+ vst1q_u8(d, *q13u8);
+ d += d_stride;
+ vst1q_u8(d, *q14u8);
+ d += d_stride;
+ vst1q_u8(d, *q15u8);
+ return;
+}
+
+void vp9_idct32x32_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int i, j, dest_stride8;
+ uint8_t *d;
+ int16_t a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ dest_stride8 = dest_stride * 8;
+ if (a1 >= 0) { // diff_positive_32_32
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ } else { // diff_negative_32_32
+ a1 = -a1;
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ }
+ return;
+}
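
For reference, the DC shortcut above in scalar form. ROUND_POWER_OF_TWO
matches the libvpx macro; the wrapper function is illustrative only:

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* input[0] is scaled by cos(pi/4) in Q14 once per 1-D pass; the final
 * 6-bit rounding yields the constant that is added to (or, negated,
 * subtracted from) every pixel of the 32x32 block. */
static int16_t idct32_dc(const int16_t *input) {
  const int16_t cospi_16_64 = 11585;                             /* Q14 */
  int16_t out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, 14);  /* rows */
  out = ROUND_POWER_OF_TWO(out * cospi_16_64, 14);               /* cols */
  return ROUND_POWER_OF_TWO(out, 6);
}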
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm
index d290d07531c..d290d07531c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
new file mode 100644
index 00000000000..309bdf8d756
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static int16_t cospi_1_64 = 16364;
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_3_64 = 16207;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_5_64 = 15893;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_7_64 = 15426;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_9_64 = 14811;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_11_64 = 14053;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_13_64 = 13160;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_15_64 = 12140;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_17_64 = 11003;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_19_64 = 9760;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_21_64 = 8423;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_23_64 = 7005;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_25_64 = 5520;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_27_64 = 3981;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_29_64 = 2404;
+static int16_t cospi_30_64 = 1606;
+static int16_t cospi_31_64 = 804;
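+// (Each cospi_k_64 above equals round(cos(k * PI / 64) * (1 << 14)),
+// i.e. the DCT cosines in Q14 fixed point.)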
+
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+ q14s16 = vld1q_s16(trans_buf + first * 8); \
+ q13s16 = vld1q_s16(trans_buf + second * 8);
+
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+ qA = vld1q_s16(out + first * 32); \
+ qB = vld1q_s16(out + second * 32);
+
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+ vst1q_s16(out + first * 32, qA); \
+ vst1q_s16(out + second * 32, qB);
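+// (The unused "prev" argument of the three macros above appears to carry
+// over the pointer-offset bookkeeping of the original assembly source.)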
+
+#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+ __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
+ q6s16, q7s16, q8s16, q9s16);
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(
+ uint8_t *p1,
+ uint8_t *p2,
+ int stride,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16) {
+ int16x4_t d8s16, d9s16, d10s16, d11s16;
+
+ d8s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d11s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d9s16 = vld1_s16((int16_t *)p1);
+ d10s16 = vld1_s16((int16_t *)p2);
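+ // the s16 loads above are plain 8-byte moves of pixel data; the lanes
+ // are reinterpreted to u8 before the widening adds below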
+
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+
+ q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+ vreinterpret_u8_s16(d9s16)));
+ q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_s16(d10s16)));
+ q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_s16(d11s16)));
+ q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+ vreinterpret_u8_s16(d8s16)));
+
+ d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
+ d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
+ d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+
+ vst1_s16((int16_t *)p1, d9s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d10s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p1, d8s16);
+ vst1_s16((int16_t *)p2, d11s16);
+ return;
+}
+
+#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
+ __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
+ q4s16, q5s16, q6s16, q7s16);
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
+ uint8_t *p1,
+ uint8_t *p2,
+ int stride,
+ int16x8_t q4s16,
+ int16x8_t q5s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16) {
+ int16x4_t d4s16, d5s16, d6s16, d7s16;
+
+ d4s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d7s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d5s16 = vld1_s16((int16_t *)p1);
+ d6s16 = vld1_s16((int16_t *)p2);
+
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+
+ q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
+ vreinterpret_u8_s16(d5s16)));
+ q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+ vreinterpret_u8_s16(d6s16)));
+ q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+ vreinterpret_u8_s16(d7s16)));
+ q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
+ vreinterpret_u8_s16(d4s16)));
+
+ d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
+ d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+ d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
+
+ vst1_s16((int16_t *)p1, d5s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d6s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p2, d7s16);
+ vst1_s16((int16_t *)p1, d4s16);
+ return;
+}
+
+#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
+ DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
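+// DO_BUTTERFLY computes the rotation
+//   *qAs16 = round14(first_const * q14s16 - second_const * q13s16)
+//   *qBs16 = round14(first_const * q13s16 + second_const * q14s16)
+// where round14(x) = (x + (1 << 13)) >> 14 with int16 saturation.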
+static INLINE void DO_BUTTERFLY(
+ int16x8_t q14s16,
+ int16x8_t q13s16,
+ int16_t first_const,
+ int16_t second_const,
+ int16x8_t *qAs16,
+ int16x8_t *qBs16) {
+ int16x4_t d30s16, d31s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
+ int16x4_t dCs16, dDs16, dAs16, dBs16;
+
+ dCs16 = vget_low_s16(q14s16);
+ dDs16 = vget_high_s16(q14s16);
+ dAs16 = vget_low_s16(q13s16);
+ dBs16 = vget_high_s16(q13s16);
+
+ d30s16 = vdup_n_s16(first_const);
+ d31s16 = vdup_n_s16(second_const);
+
+ q8s32 = vmull_s16(dCs16, d30s16);
+ q10s32 = vmull_s16(dAs16, d31s16);
+ q9s32 = vmull_s16(dDs16, d30s16);
+ q11s32 = vmull_s16(dBs16, d31s16);
+ q12s32 = vmull_s16(dCs16, d31s16);
+
+ q8s32 = vsubq_s32(q8s32, q10s32);
+ q9s32 = vsubq_s32(q9s32, q11s32);
+
+ q10s32 = vmull_s16(dDs16, d31s16);
+ q11s32 = vmull_s16(dAs16, d30s16);
+ q15s32 = vmull_s16(dBs16, d30s16);
+
+ q11s32 = vaddq_s32(q12s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q15s32);
+
+ *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
+ vqrshrn_n_s32(q9s32, 14));
+ *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
+ vqrshrn_n_s32(q10s32, 14));
+ return;
+}
+
+static INLINE void idct32_transpose_pair(
+ int16_t *input,
+ int16_t *t_buf) {
+ int16_t *in;
+ int i;
+ const int stride = 32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ for (i = 0; i < 4; i++, input += 8) {
+ in = input;
+ q8s16 = vld1q_s16(in);
+ in += stride;
+ q9s16 = vld1q_s16(in);
+ in += stride;
+ q10s16 = vld1q_s16(in);
+ in += stride;
+ q11s16 = vld1q_s16(in);
+ in += stride;
+ q12s16 = vld1q_s16(in);
+ in += stride;
+ q13s16 = vld1q_s16(in);
+ in += stride;
+ q14s16 = vld1q_s16(in);
+ in += stride;
+ q15s16 = vld1q_s16(in);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ q12s16 = vcombine_s16(d17s16, d25s16);
+ q13s16 = vcombine_s16(d19s16, d27s16);
+ q14s16 = vcombine_s16(d21s16, d29s16);
+ q15s16 = vcombine_s16(d23s16, d31s16);
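+ // the vcombine pairs above stand in for the asm's vswp: they exchange
+ // the high halves of q8-q11 with the low halves of q12-q15, the first
+ // step of the 8x8 transpose finished by the vtrn pairs below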
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
+ vreinterpretq_s32_s16(q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
+ vreinterpretq_s32_s16(q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
+ vreinterpretq_s32_s16(q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ vst1q_s16(t_buf, q0x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q0x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[1]);
+ t_buf += 8;
+ }
+ return;
+}
+
+static INLINE void idct32_bands_end_1st_pass(
+ int16_t *out,
+ int16x8_t q2s16,
+ int16x8_t q3s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16,
+ int16x8_t q10s16,
+ int16x8_t q11s16,
+ int16x8_t q12s16,
+ int16x8_t q13s16,
+ int16x8_t q14s16,
+ int16x8_t q15s16) {
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+ STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+ STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+ STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+ STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+ STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+ STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+ STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+ STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+ return;
+}
+
+static INLINE void idct32_bands_end_2nd_pass(
+ int16_t *out,
+ uint8_t *dest,
+ int stride,
+ int16x8_t q2s16,
+ int16x8_t q3s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16,
+ int16x8_t q10s16,
+ int16x8_t q11s16,
+ int16x8_t q12s16,
+ int16x8_t q13s16,
+ int16x8_t q14s16,
+ int16x8_t q15s16) {
+ uint8_t *r6 = dest + 31 * stride;
+ uint8_t *r7 = dest/* + 0 * stride*/;
+ uint8_t *r9 = dest + 15 * stride;
+ uint8_t *r10 = dest + 16 * stride;
+ int str2 = stride << 1;
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+
+ LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ return;
+}
+
+void vp9_idct32x32_1024_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int stride) {
+ int i, idct32_pass_loop;
+ int16_t trans_buf[32 * 8];
+ int16_t pass1[32 * 32];
+ int16_t pass2[32 * 32];
+ int16_t *out;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
+ for (idct32_pass_loop = 0, out = pass1;
+ idct32_pass_loop < 2;
+ idct32_pass_loop++,
+ input = pass1, // the input of pass2 is the result of pass1
+ out = pass2) {
+ for (i = 0;
+ i < 4; i++,
+ input += 32 * 8, out += 8) { // idct32_bands_loop
+ idct32_transpose_pair(input, trans_buf);
+
+ // -----------------------------------------
+ // BLOCK A: 16-19,28-31
+ // -----------------------------------------
+ // generate 16,17,30,31
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(0, 1, 31)
+ DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(31, 17, 15)
+ DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+ // part of stage 2
+ q4s16 = vaddq_s16(q0s16, q1s16);
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q6s16 = vaddq_s16(q2s16, q3s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+ // generate 18,19,28,29
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(15, 9, 23)
+ DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(23, 25, 7)
+ DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q3s16, q2s16);
+ q3s16 = vaddq_s16(q3s16, q2s16);
+ q14s16 = vsubq_s16(q1s16, q0s16);
+ q2s16 = vaddq_s16(q1s16, q0s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+ // part of stage 4
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q15s16 = vaddq_s16(q6s16, q3s16);
+ q13s16 = vsubq_s16(q5s16, q0s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+ STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+ STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q4s16, q2s16);
+ q14s16 = vsubq_s16(q6s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+ STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+ // -----------------------------------------
+ // BLOCK B: 20-23,24-27
+ // -----------------------------------------
+ // generate 20,21,26,27
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(7, 5, 27)
+ DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(27, 21, 11)
+ DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+ // generate 22,23,24,25
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(11, 13, 19)
+ DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(19, 29, 3)
+ DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+ // part of stage 2
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+ // part of stage 4
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q11s16 = vaddq_s16(q5s16, q0s16);
+ q12s16 = vaddq_s16(q6s16, q2s16);
+ q15s16 = vaddq_s16(q4s16, q3s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q11s16);
+ q9s16 = vaddq_s16(q13s16, q10s16);
+ q13s16 = vsubq_s16(q13s16, q10s16);
+ q11s16 = vsubq_s16(q14s16, q11s16);
+ STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+ LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+ q8s16 = vsubq_s16(q9s16, q12s16);
+ q10s16 = vaddq_s16(q14s16, q15s16);
+ q14s16 = vsubq_s16(q14s16, q15s16);
+ q12s16 = vaddq_s16(q9s16, q12s16);
+ STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+ q13s16 = q11s16;
+ q14s16 = q8s16;
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+ // part of stage 4
+ q14s16 = vsubq_s16(q5s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q2s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ q13s16 = vsubq_s16(q4s16, q3s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q1s16);
+ q9s16 = vaddq_s16(q13s16, q6s16);
+ q13s16 = vsubq_s16(q13s16, q6s16);
+ q1s16 = vsubq_s16(q14s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+ LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+ q14s16 = vsubq_s16(q8s16, q5s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q9s16, q0s16);
+ q0s16 = vsubq_s16(q9s16, q0s16);
+ STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+ DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
+ &q1s16, &q0s16);
+ STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+ // -----------------------------------------
+ // BLOCK C: 8-10,11-15
+ // -----------------------------------------
+ // generate 8,9,14,15
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(3, 2, 30)
+ DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(30, 18, 14)
+ DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+ // part of stage 3
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+ // generate 10,11,12,13
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(14, 10, 22)
+ DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(22, 26, 6)
+ DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+ // part of stage 3
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+ // part of stage 5
+ q8s16 = vaddq_s16(q0s16, q5s16);
+ q9s16 = vaddq_s16(q1s16, q7s16);
+ q13s16 = vsubq_s16(q1s16, q7s16);
+ q14s16 = vsubq_s16(q3s16, q4s16);
+ q10s16 = vaddq_s16(q3s16, q4s16);
+ q15s16 = vaddq_s16(q2s16, q6s16);
+ STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+ STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+ // part of stage 6
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+ q13s16 = vsubq_s16(q0s16, q5s16);
+ q14s16 = vsubq_s16(q2s16, q6s16);
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+ // -----------------------------------------
+ // BLOCK D: 0-3,4-7
+ // -----------------------------------------
+ // generate 4,5,6,7
+ // part of stage 3
+ LOAD_FROM_TRANSPOSED(6, 4, 28)
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(28, 20, 12)
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+ // generate 0,1,2,3
+ // part of stage 4
+ LOAD_FROM_TRANSPOSED(12, 0, 16)
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(16, 8, 24)
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+ // part of stage 5
+ q4s16 = vaddq_s16(q7s16, q6s16);
+ q7s16 = vsubq_s16(q7s16, q6s16);
+ q6s16 = vsubq_s16(q5s16, q14s16);
+ q5s16 = vaddq_s16(q5s16, q14s16);
+ // part of stage 6
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q3s16);
+ q10s16 = vaddq_s16(q6s16, q1s16);
+ q11s16 = vaddq_s16(q7s16, q0s16);
+ q12s16 = vsubq_s16(q7s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q1s16);
+ q14s16 = vsubq_s16(q5s16, q3s16);
+ q15s16 = vsubq_s16(q4s16, q2s16);
+ // part of stage 7
+ LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+ q2s16 = vaddq_s16(q8s16, q1s16);
+ q3s16 = vaddq_s16(q9s16, q0s16);
+ q4s16 = vsubq_s16(q9s16, q0s16);
+ q5s16 = vsubq_s16(q8s16, q1s16);
+ LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+
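+    // The first pass appears to store intermediate rows back to the
+    // scratch buffer, while the second pass adds the final values into
+    // dest, advancing eight columns per iteration.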
+ if (idct32_pass_loop == 0) {
+ idct32_bands_end_1st_pass(out,
+ q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+ q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+ } else {
+ idct32_bands_end_2nd_pass(out, dest, stride,
+ q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+ q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+ dest += 8;
+ }
+ }
+ }
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm
index 72e933eee96..72e933eee96 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c
new file mode 100644
index 00000000000..7c8a930b645
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
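+// DC-only inverse 4x4 transform: the single DC coefficient is passed
+// through dct_const_round_shift() twice (the row and column passes
+// collapse to two scalings by cospi_16_64), rounded by
+// ROUND_POWER_OF_TWO(out, 4) and added to every pixel of the block.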
+void vp9_idct4x4_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d6u8;
+ uint32x2_t d2u32 = vdup_n_u32(0);
+ uint16x8_t q8u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ q0s16 = vdupq_n_s16(a1);
+
+ // dc_only_idct_add
+ d1 = d2 = dest;
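+    // Each loop iteration handles two rows: two 4-pixel rows are loaded
+    // into the lanes of d2u32, widened and offset by a1, then saturated
+    // back to u8 and stored.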
+ for (i = 0; i < 2; i++) {
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
+ d1 += dest_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
+ vreinterpret_u8_u32(d2u32));
+ d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
+ d2 += dest_stride;
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm
index 0d4a721c4d3..0d4a721c4d3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.c
new file mode 100644
index 00000000000..dc91e0f3027
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp9_idct4x4_16_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d26u8, d27u8;
+ uint32x2_t d26u32, d27u32;
+ uint16x8_t q8u16, q9u16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+ int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+ int16x8_t q8s16, q9s16, q13s16, q14s16;
+ int32x4_t q1s32, q13s32, q14s32, q15s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+ uint8_t *d;
+ int16_t cospi_8_64 = 15137;
+ int16_t cospi_16_64 = 11585;
+ int16_t cospi_24_64 = 6270;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
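+    // Transpose the 4x4 block: vtrn_s16 interleaves the 16-bit pairs,
+    // then vtrnq_s32 below interleaves the 32-bit pairs; the same
+    // sequence is repeated between the two 1D passes.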
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ d20s16 = vdup_n_s16(cospi_8_64);
+ d21s16 = vdup_n_s16(cospi_16_64);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ d22s16 = vdup_n_s16(cospi_24_64);
+
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
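+    // vqrshrn_n_s32(x, 14) implements dct_const_round_shift():
+    // (x + (1 << 13)) >> 14, narrowed to 16 bits with saturation.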
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_high_s16(q9s16); // vswp d18 d19
+ d19s16 = vget_low_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ // do the transform on columns
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
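+    // Rows 2 and 3 are still swapped from the vswp above, so the third
+    // destination row goes through lane 1 of d27u32 and the fourth
+    // through lane 0, for both the loads and the stores below.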
+ d = dest;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
+ d += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+ d = dest;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm
index 00283fc8d78..00283fc8d78 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c
new file mode 100644
index 00000000000..24c29fb77f6
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
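+// DC-only inverse 8x8 transform: input[0] is passed through
+// dct_const_round_shift() twice, rounded by ROUND_POWER_OF_TWO(out, 5),
+// and the resulting constant a1 is added to all 64 destination pixels.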
+void vp9_idct8x8_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d5u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm
index 421d202d403..421d202d403 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
new file mode 100644
index 00000000000..2b3c1ce6065
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_28_64 = 3196;
+
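+// 8x8 transpose using the same scheme as the assembly TRANSPOSE8X8
+// macro: swap the high/low d-register halves (the vswp steps, done here
+// with vcombine_s16), then interleave 32-bit and 16-bit elements with
+// vtrnq_s32/vtrnq_s16.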
+static INLINE void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
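+// One parallel 1D 8-point IDCT across q8-q15: stage 1 computes the
+// odd-half butterflies, stages 2 and 3 the even half, and stage 4
+// recombines, mirroring the assembly IDCT8x8_1D macro.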
+static INLINE void IDCT8x8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16(cospi_24_64);
+ d1s16 = vdup_n_s16(cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
+}
+
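+// Full 8x8 inverse DCT: two 1D passes separated by transposes, then
+// round by ROUND_POWER_OF_TWO(x, 5) and add the result to the
+// destination in two batches of four rows.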
+void vp9_idct8x8_64_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
+}
+
+void vp9_idct8x8_12_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
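+  // Only q8-q11 are read in this first pass; the remaining coefficients
+  // are assumed zero in the *_12 case. Each dct_const_round_shift() is
+  // folded into a single multiply:
+  // vqrdmulhq_s16(x, 2 * c) == (x * c + (1 << 13)) >> 14.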
+ // First transform rows
+ // stage 1
+ q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+ q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+
+ q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+ q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+
+ q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+ // stage 2 & stage 3 - even half
+ q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+
+ q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+ q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+
+ q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+ q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+  // stage 3 - odd half
+ q0s16 = vaddq_s16(q9s16, q15s16);
+ q1s16 = vaddq_s16(q9s16, q13s16);
+ q2s16 = vsubq_s16(q9s16, q13s16);
+ q3s16 = vsubq_s16(q9s16, q15s16);
+
+ // stage 2 - odd half
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ q8s16 = vaddq_s16(q0s16, q7s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q7s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm
index ab5bb69202a..ab5bb69202a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
deleted file mode 100644
index 2f326e24c9e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_iht4x4_16_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
- ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
- ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
- ; into d16-d19 registers. This macro will touch q10- q15 registers and use
- ; them as buffer during calculation.
- MACRO
- IDCT4x4_1D
- ; stage 1
- vadd.s16 d23, d16, d18 ; (input[0] + input[2])
- vsub.s16 d24, d16, d18 ; (input[0] - input[2])
-
- vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64
- vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
- vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
- vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-
- ; dct_const_round_shift
- vqrshrn.s32 d26, q13, #14
- vqrshrn.s32 d27, q14, #14
- vqrshrn.s32 d29, q15, #14
- vqrshrn.s32 d28, q10, #14
-
- ; stage 2
- ; output[0] = step[0] + step[3];
- ; output[1] = step[1] + step[2];
- ; output[3] = step[0] - step[3];
- ; output[2] = step[1] - step[2];
- vadd.s16 q8, q13, q14
- vsub.s16 q9, q13, q14
- vswp d18, d19
- MEND
-
- ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
- ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
- ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
- ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
- ; q14,q15 registers and use them as buffer during calculation.
- MACRO
- IADST4x4_1D
- vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
- vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
- vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
- vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
- vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
- vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit
- vaddw.s16 q15, q15, d19 ; x0 + x3
- vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3
- vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
- vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3
-
- vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
- vadd.s32 q10, q10, q8
- vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
- vdup.32 q8, r0 ; duplicate sinpi_3_9
- vsub.s32 q11, q11, q9
- vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7
-
- vadd.s32 q13, q10, q12 ; s0 = x0 + x3
- vadd.s32 q10, q10, q11 ; x0 + x1
- vadd.s32 q14, q11, q12 ; s1 = x1 + x3
- vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3
-
- ; dct_const_round_shift
- vqrshrn.s32 d16, q13, #14
- vqrshrn.s32 d17, q14, #14
- vqrshrn.s32 d18, q15, #14
- vqrshrn.s32 d19, q10, #14
- MEND
-
- ; Generate cosine constants in d6 - d8 for the IDCT
- MACRO
- GENERATE_COSINE_CONSTANTS
- ; cospi_8_64 = 15137 = 0x3b21
- mov r0, #0x3b00
- add r0, #0x21
- ; cospi_16_64 = 11585 = 0x2d41
- mov r3, #0x2d00
- add r3, #0x41
- ; cospi_24_64 = 6270 = 0x187e
- mov r12, #0x1800
- add r12, #0x7e
-
- ; generate constant vectors
- vdup.16 d0, r0 ; duplicate cospi_8_64
- vdup.16 d1, r3 ; duplicate cospi_16_64
- vdup.16 d2, r12 ; duplicate cospi_24_64
- MEND
-
- ; Generate sine constants in d1 - d4 for the IADST.
- MACRO
- GENERATE_SINE_CONSTANTS
- ; sinpi_1_9 = 5283 = 0x14A3
- mov r0, #0x1400
- add r0, #0xa3
- ; sinpi_2_9 = 9929 = 0x26C9
- mov r3, #0x2600
- add r3, #0xc9
- ; sinpi_4_9 = 15212 = 0x3B6C
- mov r12, #0x3b00
- add r12, #0x6c
-
- ; generate constant vectors
- vdup.16 d3, r0 ; duplicate sinpi_1_9
-
- ; sinpi_3_9 = 13377 = 0x3441
- mov r0, #0x3400
- add r0, #0x41
-
- vdup.16 d4, r3 ; duplicate sinpi_2_9
- vdup.16 d5, r12 ; duplicate sinpi_4_9
- vdup.16 q3, r0 ; duplicate sinpi_3_9
- MEND
-
- ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
- MACRO
- TRANSPOSE4X4
- vtrn.16 d16, d17
- vtrn.16 d18, d19
- vtrn.32 q8, q9
- MEND
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride, int tx_type)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride
-; r3 int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht4x4_16_add_neon| PROC
-
- ; load the inputs into d16-d19
- vld1.s16 {q8,q9}, [r0]!
-
- ; transpose the input data
- TRANSPOSE4X4
-
- ; decide the type of transform
- cmp r3, #2
- beq idct_iadst
- cmp r3, #3
- beq iadst_iadst
-
-iadst_idct
- ; generate constants
- GENERATE_COSINE_CONSTANTS
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IDCT4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IADST4x4_1D
-
- b end_vp9_iht4x4_16_add_neon
-
-idct_iadst
- ; generate constants
- GENERATE_COSINE_CONSTANTS
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IADST4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IDCT4x4_1D
-
- b end_vp9_iht4x4_16_add_neon
-
-iadst_iadst
- ; generate constants
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IADST4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IADST4x4_1D
-
-end_vp9_iht4x4_16_add_neon
- ; ROUND_POWER_OF_TWO(temp_out[j], 4)
- vrshr.s16 q8, q8, #4
- vrshr.s16 q9, q9, #4
-
- vld1.32 {d26[0]}, [r1], r2
- vld1.32 {d26[1]}, [r1], r2
- vld1.32 {d27[0]}, [r1], r2
- vld1.32 {d27[1]}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d26
- vaddw.u8 q9, q9, d27
-
- ; clip_pixel
- vqmovun.s16 d26, q8
- vqmovun.s16 d27, q9
-
- ; do the stores in reverse order with negative post-increment, by changing
- ; the sign of the stride
- rsb r2, r2, #0
- vst1.32 {d27[1]}, [r1], r2
- vst1.32 {d27[0]}, [r1], r2
- vst1.32 {d26[1]}, [r1], r2
- vst1.32 {d26[0]}, [r1] ; no post-increment
- bx lr
- ENDP ; |vp9_iht4x4_16_add_neon|
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
new file mode 100644
index 00000000000..1761fada2fa
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t sinpi_1_9 = 0x14a3;
+static int16_t sinpi_2_9 = 0x26c9;
+static int16_t sinpi_3_9 = 0x3441;
+static int16_t sinpi_4_9 = 0x3b6c;
+static int16_t cospi_8_64 = 0x3b21;
+static int16_t cospi_16_64 = 0x2d41;
+static int16_t cospi_24_64 = 0x187e;
+
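+// 4x4 transpose: vtrn_s16 on the d-register halves followed by
+// vtrnq_s32, the intrinsics equivalent of the assembly TRANSPOSE4X4
+// macro (vtrn.16 / vtrn.32).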
+static INLINE void TRANSPOSE4X4(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int32x4_t q8s32, q9s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+
+ d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
+ d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
+
+ q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+ q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+ q0x2s32 = vtrnq_s32(q8s32, q9s32);
+
+ *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+ *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+ return;
+}
+
+static INLINE void GENERATE_COSINE_CONSTANTS(
+ int16x4_t *d0s16,
+ int16x4_t *d1s16,
+ int16x4_t *d2s16) {
+ *d0s16 = vdup_n_s16(cospi_8_64);
+ *d1s16 = vdup_n_s16(cospi_16_64);
+ *d2s16 = vdup_n_s16(cospi_24_64);
+ return;
+}
+
+static INLINE void GENERATE_SINE_CONSTANTS(
+ int16x4_t *d3s16,
+ int16x4_t *d4s16,
+ int16x4_t *d5s16,
+ int16x8_t *q3s16) {
+ *d3s16 = vdup_n_s16(sinpi_1_9);
+ *d4s16 = vdup_n_s16(sinpi_2_9);
+ *q3s16 = vdupq_n_s16(sinpi_3_9);
+ *d5s16 = vdup_n_s16(sinpi_4_9);
+ return;
+}
+
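+// One parallel 1D 4-point IDCT matching the assembly IDCT4x4_1D macro:
+// stage 1 butterflies with cospi_8/16/24_64, then the stage 2
+// recombination with the row swap folded into the final vcombine.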
+static INLINE void IDCT4x4_1D(
+ int16x4_t *d0s16,
+ int16x4_t *d1s16,
+ int16x4_t *d2s16,
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ int32x4_t q10s32, q13s32, q14s32, q15s32;
+ int16x8_t q13s16, q14s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, *d2s16);
+ q10s32 = vmull_s16(d17s16, *d0s16);
+ q13s32 = vmull_s16(d23s16, *d1s16);
+ q14s32 = vmull_s16(d24s16, *d1s16);
+ q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
+ q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q10s32, 14);
+
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+ *q8s16 = vaddq_s16(q13s16, q14s16);
+ *q9s16 = vsubq_s16(q13s16, q14s16);
+ *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
+ vget_low_s16(*q9s16)); // vswp
+ return;
+}
+
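+// One parallel 1D 4-point IADST; following the assembly macro, the
+// intermediate products are s0 = sinpi_1_9 * x0, s1 = sinpi_2_9 * x0,
+// s2 = sinpi_3_9 * x1, s3 = sinpi_4_9 * x2, s4 = sinpi_1_9 * x2,
+// s5 = sinpi_2_9 * x3, s6 = sinpi_4_9 * x3 and s7 = x0 + x3 - x2.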
+static INLINE void IADST4x4_1D(
+ int16x4_t *d3s16,
+ int16x4_t *d4s16,
+ int16x4_t *d5s16,
+ int16x8_t *q3s16,
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d6s16 = vget_low_s16(*q3s16);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+
+ q10s32 = vmull_s16(*d3s16, d16s16);
+ q11s32 = vmull_s16(*d4s16, d16s16);
+ q12s32 = vmull_s16(d6s16, d17s16);
+ q13s32 = vmull_s16(*d5s16, d18s16);
+ q14s32 = vmull_s16(*d3s16, d18s16);
+ q15s32 = vmovl_s16(d16s16);
+ q15s32 = vaddw_s16(q15s32, d19s16);
+ q8s32 = vmull_s16(*d4s16, d19s16);
+ q15s32 = vsubw_s16(q15s32, d18s16);
+ q9s32 = vmull_s16(*d5s16, d19s16);
+
+ q10s32 = vaddq_s32(q10s32, q13s32);
+ q10s32 = vaddq_s32(q10s32, q8s32);
+ q11s32 = vsubq_s32(q11s32, q14s32);
+ q8s32 = vdupq_n_s32(sinpi_3_9);
+ q11s32 = vsubq_s32(q11s32, q9s32);
+ q15s32 = vmulq_s32(q15s32, q8s32);
+
+ q13s32 = vaddq_s32(q10s32, q12s32);
+ q10s32 = vaddq_s32(q10s32, q11s32);
+ q14s32 = vaddq_s32(q11s32, q12s32);
+ q10s32 = vsubq_s32(q10s32, q12s32);
+
+ d16s16 = vqrshrn_n_s32(q13s32, 14);
+ d17s16 = vqrshrn_n_s32(q14s32, 14);
+ d18s16 = vqrshrn_n_s32(q15s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+ *q8s16 = vcombine_s16(d16s16, d17s16);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ return;
+}
+
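+// Hybrid 4x4 inverse transform: tx_type selects IDCT or IADST per
+// direction (rows are transformed first, then columns). tx_type 0
+// (DCT_DCT) falls back to the C implementation; like the assembly
+// version, the NEON path only handles tx_type 1-3.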
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ uint8x8_t d26u8, d27u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+ uint32x2_t d26u32, d27u32;
+ int16x8_t q3s16, q8s16, q9s16;
+ uint16x8_t q8u16, q9u16;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ switch (tx_type) {
+    case 0:  // idct_idct is not supported. Fall back to C
+      vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+      return;
+ case 1: // iadst_idct
+ // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+ case 2: // idct_iadst
+      // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+ break;
+ case 3: // iadst_iadst
+ // generate constants
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+    default:  // invalid tx_type
+ assert(0);
+ break;
+ }
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
+ dest += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
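+  // dest was advanced three rows during the loads above, so store the
+  // rows in reverse order while stepping back up, as the assembly
+  // version did with a negated stride.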
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
deleted file mode 100644
index b41f5661b80..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
+++ /dev/null
@@ -1,698 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_iht8x8_64_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Generate IADST constants in r0 - r12 for the IADST.
- MACRO
- GENERATE_IADST_CONSTANTS
- ; generate cospi_2_64 = 16305
- mov r0, #0x3f00
- add r0, #0xb1
-
- ; generate cospi_30_64 = 1606
- mov r1, #0x600
- add r1, #0x46
-
- ; generate cospi_10_64 = 14449
- mov r2, #0x3800
- add r2, #0x71
-
- ; generate cospi_22_64 = 7723
- mov r3, #0x1e00
- add r3, #0x2b
-
- ; generate cospi_18_64 = 10394
- mov r4, #0x2800
- add r4, #0x9a
-
- ; generate cospi_14_64 = 12665
- mov r5, #0x3100
- add r5, #0x79
-
- ; generate cospi_26_64 = 4756
- mov r6, #0x1200
- add r6, #0x94
-
- ; generate cospi_6_64 = 15679
- mov r7, #0x3d00
- add r7, #0x3f
-
- ; generate cospi_8_64 = 15137
- mov r8, #0x3b00
- add r8, #0x21
-
- ; generate cospi_24_64 = 6270
- mov r9, #0x1800
- add r9, #0x7e
-
- ; generate 0
- mov r10, #0
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
- MEND
-
- ; Generate IDCT constants in r3 - r9 for the IDCT.
- MACRO
- GENERATE_IDCT_CONSTANTS
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
-
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
-
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
-
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
-
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
-
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
-
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
- MEND
-
- ; Transpose a 8x8 16bits data matrix. Datas are loaded in q8-q15.
- MACRO
- TRANSPOSE8X8
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- MEND
-
- ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which are
- ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
- ; will be stored back into q8-q15 registers. This macro will touch q0-q7
- ; registers and use them as buffer during calculation.
- MACRO
- IDCT8x8_1D
- ; stage 1
- vdup.16 d0, r3 ; duplicate cospi_28_64
- vdup.16 d1, r4 ; duplicate cospi_4_64
- vdup.16 d2, r5 ; duplicate cospi_12_64
- vdup.16 d3, r6 ; duplicate cospi_20_64
-
- ; input[1] * cospi_28_64
- vmull.s16 q2, d18, d0
- vmull.s16 q3, d19, d0
-
- ; input[5] * cospi_12_64
- vmull.s16 q5, d26, d2
- vmull.s16 q6, d27, d2
-
- ; input[1]*cospi_28_64-input[7]*cospi_4_64
- vmlsl.s16 q2, d30, d1
- vmlsl.s16 q3, d31, d1
-
- ; input[5] * cospi_12_64 - input[3] * cospi_20_64
- vmlsl.s16 q5, d22, d3
- vmlsl.s16 q6, d23, d3
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d8, q2, #14 ; >> 14
- vqrshrn.s32 d9, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; input[1] * cospi_4_64
- vmull.s16 q2, d18, d1
- vmull.s16 q3, d19, d1
-
- ; input[5] * cospi_20_64
- vmull.s16 q9, d26, d3
- vmull.s16 q13, d27, d3
-
- ; input[1]*cospi_4_64+input[7]*cospi_28_64
- vmlal.s16 q2, d30, d0
- vmlal.s16 q3, d31, d0
-
- ; input[5] * cospi_20_64 + input[3] * cospi_12_64
- vmlal.s16 q9, d22, d2
- vmlal.s16 q13, d23, d2
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d14, q2, #14 ; >> 14
- vqrshrn.s32 d15, q3, #14 ; >> 14
-
- ; stage 2 & stage 3 - even half
- vdup.16 d0, r7 ; duplicate cospi_16_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q13, #14 ; >> 14
-
- ; input[0] * cospi_16_64
- vmull.s16 q2, d16, d0
- vmull.s16 q3, d17, d0
-
- ; input[0] * cospi_16_64
- vmull.s16 q13, d16, d0
- vmull.s16 q15, d17, d0
-
- ; (input[0] + input[2]) * cospi_16_64
- vmlal.s16 q2, d24, d0
- vmlal.s16 q3, d25, d0
-
- ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q13, d24, d0
- vmlsl.s16 q15, d25, d0
-
- vdup.16 d0, r8 ; duplicate cospi_24_64
- vdup.16 d1, r9 ; duplicate cospi_8_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d18, q2, #14 ; >> 14
- vqrshrn.s32 d19, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d22, q13, #14 ; >> 14
- vqrshrn.s32 d23, q15, #14 ; >> 14
-
- ; input[1] * cospi_24_64
- vmull.s16 q2, d20, d0
- vmull.s16 q3, d21, d0
-
- ; input[1] * cospi_8_64
- vmull.s16 q8, d20, d1
- vmull.s16 q12, d21, d1
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlsl.s16 q2, d28, d1
- vmlsl.s16 q3, d29, d1
-
- ; input[1] * cospi_8_64 + input[3] * cospi_24_64
- vmlal.s16 q8, d28, d0
- vmlal.s16 q12, d29, d0
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d26, q2, #14 ; >> 14
- vqrshrn.s32 d27, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d30, q8, #14 ; >> 14
- vqrshrn.s32 d31, q12, #14 ; >> 14
-
- vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
- vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
- vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
- vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
-
- ; stage 3 -odd half
- vdup.16 d16, r7 ; duplicate cospi_16_64
-
- ; stage 2 - odd half
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
- vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q11, d28, d16
- vmull.s16 q12, d29, d16
-
- ; (step2[6] - step2[5]) * cospi_16_64
- vmlsl.s16 q9, d26, d16
- vmlsl.s16 q10, d27, d16
-
- ; (step2[5] + step2[6]) * cospi_16_64
- vmlal.s16 q11, d26, d16
- vmlal.s16 q12, d27, d16
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q9, #14 ; >> 14
- vqrshrn.s32 d11, q10, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q12, #14 ; >> 14
-
- ; stage 4
- vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
- MEND
-
- ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which
- ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
- ; output will be stored back into q8-q15 registers. This macro will touch
- ; q0 - q7 registers and use them as buffer during calculation.
- MACRO
- IADST8X8_1D
- vdup.16 d14, r0 ; duplicate cospi_2_64
- vdup.16 d15, r1 ; duplicate cospi_30_64
-
- ; cospi_2_64 * x0
- vmull.s16 q1, d30, d14
- vmull.s16 q2, d31, d14
-
- ; cospi_30_64 * x0
- vmull.s16 q3, d30, d15
- vmull.s16 q4, d31, d15
-
- vdup.16 d30, r4 ; duplicate cospi_18_64
- vdup.16 d31, r5 ; duplicate cospi_14_64
-
- ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- vmlal.s16 q1, d16, d15
- vmlal.s16 q2, d17, d15
-
- ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1
- vmlsl.s16 q3, d16, d14
- vmlsl.s16 q4, d17, d14
-
- ; cospi_18_64 * x4
- vmull.s16 q5, d22, d30
- vmull.s16 q6, d23, d30
-
- ; cospi_14_64 * x4
- vmull.s16 q7, d22, d31
- vmull.s16 q8, d23, d31
-
- ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- vmlal.s16 q5, d24, d31
- vmlal.s16 q6, d25, d31
-
- ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
- vmlsl.s16 q7, d24, d30
- vmlsl.s16 q8, d25, d30
-
- ; (s0 + s4)
- vadd.s32 q11, q1, q5
- vadd.s32 q12, q2, q6
-
- vdup.16 d0, r2 ; duplicate cospi_10_64
- vdup.16 d1, r3 ; duplicate cospi_22_64
-
- ; (s0 - s4)
- vsub.s32 q1, q1, q5
- vsub.s32 q2, q2, q6
-
- ; x0 = dct_const_round_shift(s0 + s4);
- vqrshrn.s32 d22, q11, #14 ; >> 14
- vqrshrn.s32 d23, q12, #14 ; >> 14
-
- ; (s1 + s5)
- vadd.s32 q12, q3, q7
- vadd.s32 q15, q4, q8
-
- ; (s1 - s5)
- vsub.s32 q3, q3, q7
- vsub.s32 q4, q4, q8
-
- ; x4 = dct_const_round_shift(s0 - s4);
- vqrshrn.s32 d2, q1, #14 ; >> 14
- vqrshrn.s32 d3, q2, #14 ; >> 14
-
- ; x1 = dct_const_round_shift(s1 + s5);
- vqrshrn.s32 d24, q12, #14 ; >> 14
- vqrshrn.s32 d25, q15, #14 ; >> 14
-
- ; x5 = dct_const_round_shift(s1 - s5);
- vqrshrn.s32 d6, q3, #14 ; >> 14
- vqrshrn.s32 d7, q4, #14 ; >> 14
-
- ; cospi_10_64 * x2
- vmull.s16 q4, d26, d0
- vmull.s16 q5, d27, d0
-
- ; cospi_22_64 * x2
- vmull.s16 q2, d26, d1
- vmull.s16 q6, d27, d1
-
- vdup.16 d30, r6 ; duplicate cospi_26_64
- vdup.16 d31, r7 ; duplicate cospi_6_64
-
- ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- vmlal.s16 q4, d20, d1
- vmlal.s16 q5, d21, d1
-
- ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- vmlsl.s16 q2, d20, d0
- vmlsl.s16 q6, d21, d0
-
- ; cospi_26_64 * x6
- vmull.s16 q0, d18, d30
- vmull.s16 q13, d19, d30
-
- ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- vmlal.s16 q0, d28, d31
- vmlal.s16 q13, d29, d31
-
- ; cospi_6_64 * x6
- vmull.s16 q10, d18, d31
- vmull.s16 q9, d19, d31
-
- ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- vmlsl.s16 q10, d28, d30
- vmlsl.s16 q9, d29, d30
-
- ; (s3 + s7)
- vadd.s32 q14, q2, q10
- vadd.s32 q15, q6, q9
-
- ; (s3 - s7)
- vsub.s32 q2, q2, q10
- vsub.s32 q6, q6, q9
-
- ; x3 = dct_const_round_shift(s3 + s7);
- vqrshrn.s32 d28, q14, #14 ; >> 14
- vqrshrn.s32 d29, q15, #14 ; >> 14
-
- ; x7 = dct_const_round_shift(s3 - s7);
- vqrshrn.s32 d4, q2, #14 ; >> 14
- vqrshrn.s32 d5, q6, #14 ; >> 14
-
- ; (s2 + s6)
- vadd.s32 q9, q4, q0
- vadd.s32 q10, q5, q13
-
- ; (s2 - s6)
- vsub.s32 q4, q4, q0
- vsub.s32 q5, q5, q13
-
- vdup.16 d30, r8 ; duplicate cospi_8_64
- vdup.16 d31, r9 ; duplicate cospi_24_64
-
- ; x2 = dct_const_round_shift(s2 + s6);
- vqrshrn.s32 d18, q9, #14 ; >> 14
- vqrshrn.s32 d19, q10, #14 ; >> 14
-
- ; x6 = dct_const_round_shift(s2 - s6);
- vqrshrn.s32 d8, q4, #14 ; >> 14
- vqrshrn.s32 d9, q5, #14 ; >> 14
-
- ; cospi_8_64 * x4
- vmull.s16 q5, d2, d30
- vmull.s16 q6, d3, d30
-
- ; cospi_24_64 * x4
- vmull.s16 q7, d2, d31
- vmull.s16 q0, d3, d31
-
- ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- vmlal.s16 q5, d6, d31
- vmlal.s16 q6, d7, d31
-
- ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- vmlsl.s16 q7, d6, d30
- vmlsl.s16 q0, d7, d30
-
- ; cospi_8_64 * x7
- vmull.s16 q1, d4, d30
- vmull.s16 q3, d5, d30
-
- ; cospi_24_64 * x7
- vmull.s16 q10, d4, d31
- vmull.s16 q2, d5, d31
-
- ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- vmlsl.s16 q1, d8, d31
- vmlsl.s16 q3, d9, d31
-
- ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
- vmlal.s16 q10, d8, d30
- vmlal.s16 q2, d9, d30
-
- vadd.s16 q8, q11, q9 ; x0 = s0 + s2;
-
- vsub.s16 q11, q11, q9 ; x2 = s0 - s2;
-
- vadd.s16 q4, q12, q14 ; x1 = s1 + s3;
-
- vsub.s16 q12, q12, q14 ; x3 = s1 - s3;
-
- ; (s4 + s6)
- vadd.s32 q14, q5, q1
- vadd.s32 q15, q6, q3
-
- ; (s4 - s6)
- vsub.s32 q5, q5, q1
- vsub.s32 q6, q6, q3
-
- ; x4 = dct_const_round_shift(s4 + s6);
- vqrshrn.s32 d18, q14, #14 ; >> 14
- vqrshrn.s32 d19, q15, #14 ; >> 14
-
- ; x6 = dct_const_round_shift(s4 - s6);
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; (s5 + s7)
- vadd.s32 q1, q7, q10
- vadd.s32 q3, q0, q2
-
- ; (s5 - s7))
- vsub.s32 q7, q7, q10
- vsub.s32 q0, q0, q2
-
- ; x5 = dct_const_round_shift(s5 + s7);
- vqrshrn.s32 d28, q1, #14 ; >> 14
- vqrshrn.s32 d29, q3, #14 ; >> 14
-
- ; x7 = dct_const_round_shift(s5 - s7);
- vqrshrn.s32 d14, q7, #14 ; >> 14
- vqrshrn.s32 d15, q0, #14 ; >> 14
-
- vdup.16 d30, r12 ; duplicate cospi_16_64
-
- ; cospi_16_64 * x2
- vmull.s16 q2, d22, d30
- vmull.s16 q3, d23, d30
-
- ; cospi_16_64 * x2
- vmull.s16 q13, d22, d30
- vmull.s16 q1, d23, d30
-
- ; cospi_16_64 * x2 + cospi_16_64 * x3;
- vmlal.s16 q2, d24, d30
- vmlal.s16 q3, d25, d30
-
- ; cospi_16_64 * x2 - cospi_16_64 * x3;
- vmlsl.s16 q13, d24, d30
- vmlsl.s16 q1, d25, d30
-
- ; x2 = dct_const_round_shift(s2);
- vqrshrn.s32 d4, q2, #14 ; >> 14
- vqrshrn.s32 d5, q3, #14 ; >> 14
-
- ;x3 = dct_const_round_shift(s3);
- vqrshrn.s32 d24, q13, #14 ; >> 14
- vqrshrn.s32 d25, q1, #14 ; >> 14
-
- ; cospi_16_64 * x6
- vmull.s16 q13, d10, d30
- vmull.s16 q1, d11, d30
-
- ; cospi_16_64 * x6
- vmull.s16 q11, d10, d30
- vmull.s16 q0, d11, d30
-
- ; cospi_16_64 * x6 + cospi_16_64 * x7;
- vmlal.s16 q13, d14, d30
- vmlal.s16 q1, d15, d30
-
- ; cospi_16_64 * x6 - cospi_16_64 * x7;
- vmlsl.s16 q11, d14, d30
- vmlsl.s16 q0, d15, d30
-
- ; x6 = dct_const_round_shift(s6);
- vqrshrn.s32 d20, q13, #14 ; >> 14
- vqrshrn.s32 d21, q1, #14 ; >> 14
-
- ;x7 = dct_const_round_shift(s7);
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q0, #14 ; >> 14
-
- vdup.16 q5, r10 ; duplicate 0
-
- vsub.s16 q9, q5, q9 ; output[1] = -x4;
- vsub.s16 q11, q5, q2 ; output[3] = -x2;
- vsub.s16 q13, q5, q6 ; output[5] = -x7;
- vsub.s16 q15, q5, q4 ; output[7] = -x1;
- MEND
-
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride, int tx_type)
-;
-; r0 int16_t *input
-; r1 uint8_t *dest
-; r2 int dest_stride
-; r3 int tx_type
-; This function handles only tx_type values 1, 2 and 3.
-|vp9_iht8x8_64_add_neon| PROC
-
- ; load the inputs into q8-q15 (d16-d31)
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
-
- push {r0-r10}
- vpush {d8-d15}
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; decide the type of transform
- cmp r3, #2
- beq idct_iadst
- cmp r3, #3
- beq iadst_iadst
-
-iadst_idct
- ; generate IDCT constants
- GENERATE_IDCT_CONSTANTS
-
- ; first transform rows
- IDCT8x8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; then transform columns
- IADST8X8_1D
-
- b end_vp9_iht8x8_64_add_neon
-
-idct_iadst
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; first transform rows
- IADST8X8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; generate IDCT constants
- GENERATE_IDCT_CONSTANTS
-
- ; then transform columns
- IDCT8x8_1D
-
- b end_vp9_iht8x8_64_add_neon
-
-iadst_iadst
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; first transform rows
- IADST8X8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; then transform columns
- IADST8X8_1D
-
-end_vp9_iht8x8_64_add_neon
- vpop {d8-d15}
- pop {r0-r10}
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5)
- vrshr.s16 q8, q8, #5
- vrshr.s16 q9, q9, #5
- vrshr.s16 q10, q10, #5
- vrshr.s16 q11, q11, #5
- vrshr.s16 q12, q12, #5
- vrshr.s16 q13, q13, #5
- vrshr.s16 q14, q14, #5
- vrshr.s16 q15, q15, #5
-
- ; save dest pointer
- mov r0, r1
-
- ; load destination data
- vld1.64 {d0}, [r1], r2
- vld1.64 {d1}, [r1], r2
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d0
- vaddw.u8 q9, q9, d1
- vaddw.u8 q10, q10, d2
- vaddw.u8 q11, q11, d3
- vaddw.u8 q12, q12, d4
- vaddw.u8 q13, q13, d5
- vaddw.u8 q14, q14, d6
- vaddw.u8 q15, q15, d7
-
- ; clip_pixel
- vqmovun.s16 d0, q8
- vqmovun.s16 d1, q9
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- vqmovun.s16 d4, q12
- vqmovun.s16 d5, q13
- vqmovun.s16 d6, q14
- vqmovun.s16 d7, q15
-
- ; store the data
- vst1.64 {d0}, [r0], r2
- vst1.64 {d1}, [r0], r2
- vst1.64 {d2}, [r0], r2
- vst1.64 {d3}, [r0], r2
- vst1.64 {d4}, [r0], r2
- vst1.64 {d5}, [r0], r2
- vst1.64 {d6}, [r0], r2
- vst1.64 {d7}, [r0], r2
- bx lr
- ENDP ; |vp9_iht8x8_64_add_neon|
-
- END
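
A note on the rounding used in the shift-by-14 steps above and in the intrinsics rewrite that follows: vqrshrn.s32 #14 (vqrshrn_n_s32(..., 14) in C) is the vector form of libvpx's dct_const_round_shift(). A minimal scalar sketch, with the DCT_CONST_BITS and ROUND_POWER_OF_TWO definitions reproduced from vp9's headers for illustration:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Round a 32-bit accumulator back to the 14-bit fixed-point scale; the
   NEON instruction additionally saturates the result to the int16_t range. */
static int16_t dct_const_round_shift(int32_t input) {
  return (int16_t)ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}
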
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
new file mode 100644
index 00000000000..04b342c3d34
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
+
+static INLINE void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+static INLINE void IDCT8x8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16(cospi_24_64);
+ d1s16 = vdup_n_s16(cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
+}
+
+static INLINE void IADST8X8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q2s16, q4s16, q5s16, q6s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+ int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ d14s16 = vdup_n_s16(cospi_2_64);
+ d15s16 = vdup_n_s16(cospi_30_64);
+
+ q1s32 = vmull_s16(d30s16, d14s16);
+ q2s32 = vmull_s16(d31s16, d14s16);
+ q3s32 = vmull_s16(d30s16, d15s16);
+ q4s32 = vmull_s16(d31s16, d15s16);
+
+ d30s16 = vdup_n_s16(cospi_18_64);
+ d31s16 = vdup_n_s16(cospi_14_64);
+
+ q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
+ q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+ q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
+ q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+ q5s32 = vmull_s16(d22s16, d30s16);
+ q6s32 = vmull_s16(d23s16, d30s16);
+ q7s32 = vmull_s16(d22s16, d31s16);
+ q8s32 = vmull_s16(d23s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
+ q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+ q11s32 = vaddq_s32(q1s32, q5s32);
+ q12s32 = vaddq_s32(q2s32, q6s32);
+ q1s32 = vsubq_s32(q1s32, q5s32);
+ q2s32 = vsubq_s32(q2s32, q6s32);
+
+ d22s16 = vqrshrn_n_s32(q11s32, 14);
+ d23s16 = vqrshrn_n_s32(q12s32, 14);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q12s32 = vaddq_s32(q3s32, q7s32);
+ q15s32 = vaddq_s32(q4s32, q8s32);
+ q3s32 = vsubq_s32(q3s32, q7s32);
+ q4s32 = vsubq_s32(q4s32, q8s32);
+
+ d2s16 = vqrshrn_n_s32(q1s32, 14);
+ d3s16 = vqrshrn_n_s32(q2s32, 14);
+ d24s16 = vqrshrn_n_s32(q12s32, 14);
+ d25s16 = vqrshrn_n_s32(q15s32, 14);
+ d6s16 = vqrshrn_n_s32(q3s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ d0s16 = vdup_n_s16(cospi_10_64);
+ d1s16 = vdup_n_s16(cospi_22_64);
+ q4s32 = vmull_s16(d26s16, d0s16);
+ q5s32 = vmull_s16(d27s16, d0s16);
+ q2s32 = vmull_s16(d26s16, d1s16);
+ q6s32 = vmull_s16(d27s16, d1s16);
+
+ d30s16 = vdup_n_s16(cospi_26_64);
+ d31s16 = vdup_n_s16(cospi_6_64);
+
+ q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+ q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
+ q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+ q0s32 = vmull_s16(d18s16, d30s16);
+ q13s32 = vmull_s16(d19s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
+ q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+ q10s32 = vmull_s16(d18s16, d31s16);
+ q9s32 = vmull_s16(d19s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
+ q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+ q14s32 = vaddq_s32(q2s32, q10s32);
+ q15s32 = vaddq_s32(q6s32, q9s32);
+ q2s32 = vsubq_s32(q2s32, q10s32);
+ q6s32 = vsubq_s32(q6s32, q9s32);
+
+ d28s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ q9s32 = vaddq_s32(q4s32, q0s32);
+ q10s32 = vaddq_s32(q5s32, q13s32);
+ q4s32 = vsubq_s32(q4s32, q0s32);
+ q5s32 = vsubq_s32(q5s32, q13s32);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ d18s16 = vqrshrn_n_s32(q9s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+ d8s16 = vqrshrn_n_s32(q4s32, 14);
+ d9s16 = vqrshrn_n_s32(q5s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q5s32 = vmull_s16(d2s16, d30s16);
+ q6s32 = vmull_s16(d3s16, d30s16);
+ q7s32 = vmull_s16(d2s16, d31s16);
+ q0s32 = vmull_s16(d3s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+ q1s32 = vmull_s16(d4s16, d30s16);
+ q3s32 = vmull_s16(d5s16, d30s16);
+ q10s32 = vmull_s16(d4s16, d31s16);
+ q2s32 = vmull_s16(d5s16, d31s16);
+
+ q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+ q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+ *q8s16 = vaddq_s16(*q11s16, *q9s16);
+ *q11s16 = vsubq_s16(*q11s16, *q9s16);
+ q4s16 = vaddq_s16(*q12s16, *q14s16);
+ *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+ q14s32 = vaddq_s32(q5s32, q1s32);
+ q15s32 = vaddq_s32(q6s32, q3s32);
+ q5s32 = vsubq_s32(q5s32, q1s32);
+ q6s32 = vsubq_s32(q6s32, q3s32);
+
+ d18s16 = vqrshrn_n_s32(q14s32, 14);
+ d19s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q1s32 = vaddq_s32(q7s32, q10s32);
+ q3s32 = vaddq_s32(q0s32, q2s32);
+ q7s32 = vsubq_s32(q7s32, q10s32);
+ q0s32 = vsubq_s32(q0s32, q2s32);
+
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ d29s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q7s32, 14);
+ d15s16 = vqrshrn_n_s32(q0s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ d30s16 = vdup_n_s16(cospi_16_64);
+
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ q2s32 = vmull_s16(d22s16, d30s16);
+ q3s32 = vmull_s16(d23s16, d30s16);
+ q13s32 = vmull_s16(d22s16, d30s16);
+ q1s32 = vmull_s16(d23s16, d30s16);
+
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
+ q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q3s32, 14);
+ d24s16 = vqrshrn_n_s32(q13s32, 14);
+ d25s16 = vqrshrn_n_s32(q1s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ q13s32 = vmull_s16(d10s16, d30s16);
+ q1s32 = vmull_s16(d11s16, d30s16);
+ q11s32 = vmull_s16(d10s16, d30s16);
+ q0s32 = vmull_s16(d11s16, d30s16);
+
+ q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+ q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+ d20s16 = vqrshrn_n_s32(q13s32, 14);
+ d21s16 = vqrshrn_n_s32(q1s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q0s32, 14);
+ *q10s16 = vcombine_s16(d20s16, d21s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q5s16 = vdupq_n_s16(0);
+
+ *q9s16 = vsubq_s16(q5s16, *q9s16);
+ *q11s16 = vsubq_s16(q5s16, q2s16);
+ *q13s16 = vsubq_s16(q5s16, q6s16);
+ *q15s16 = vsubq_s16(q5s16, q4s16);
+ return;
+}
+
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i;
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 8 * 2);
+ q11s16 = vld1q_s16(input + 8 * 3);
+ q12s16 = vld1q_s16(input + 8 * 4);
+ q13s16 = vld1q_s16(input + 8 * 5);
+ q14s16 = vld1q_s16(input + 8 * 6);
+ q15s16 = vld1q_s16(input + 8 * 7);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ switch (tx_type) {
+ case 0: // idct_idct is not supported. Fall back to C
+ vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+ return;
+ break;
+ case 1: // iadst_idct
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // first transform rows
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ case 2: // idct_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // then transform columns
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ case 3: // iadst_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ default: // invalid tx_type
+ assert(0);
+ break;
+ }
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ for (d1 = d2 = dest, i = 0; i < 2; i++) {
+ if (i != 0) {
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+ }
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ }
+ return;
+}
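
The epilogue of vp9_iht8x8_64_add_neon above (vrshrq_n_s16 by 5, vaddw_u8, vqmovun_s16) rounds each residual by ROUND_POWER_OF_TWO(x, 5), adds it to the prediction and clips to 8 bits. A scalar sketch of that step; clip_pixel mirrors the helper in vp9_common.h, while add_residual_8x8 is an illustrative name:

#include <stdint.h>

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val > 255 ? 255 : (val < 0 ? 0 : val));
}

/* Add the rounded 8x8 residual to the destination block, as the
   vaddw_u8/vqmovun_s16 loop above does eight pixels at a time. */
static void add_residual_8x8(const int16_t *res, uint8_t *dest,
                             int dest_stride) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      dest[r * dest_stride + c] = clip_pixel(
          ((res[r * 8 + c] + 16) >> 5) + dest[r * dest_stride + c]);
}
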
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index bc6a17cd16f..c69ee1009f5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,46 +8,172 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <arm_neon.h>
+
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+static INLINE void vp9_loop_filter_neon_16(
+ uint8x16_t qblimit, // blimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vdupq_n_u16(3);
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q4 = vdupq_n_u8(3);
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+ return;
+}
+
+void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
const uint8_t *blimit0,
const uint8_t *limit0,
const uint8_t *thresh0,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
- vp9_lpf_horizontal_8(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_horizontal_8(s + 8, p, blimit1, limit1, thresh1, 1);
-}
+ uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+ uint8x16_t qblimit, qlimit, qthresh;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
-void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
+ dblimit0 = vld1_u8(blimit0);
+ dlimit0 = vld1_u8(limit0);
+ dthresh0 = vld1_u8(thresh0);
+ dblimit1 = vld1_u8(blimit1);
+ dlimit1 = vld1_u8(limit1);
+ dthresh1 = vld1_u8(thresh1);
+ qblimit = vcombine_u8(dblimit0, dblimit1);
+ qlimit = vcombine_u8(dlimit0, dlimit1);
+ qthresh = vcombine_u8(dthresh0, dthresh1);
-void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
+ s -= (p << 2);
+
+ q3u8 = vld1q_u8(s);
+ s += p;
+ q4u8 = vld1q_u8(s);
+ s += p;
+ q5u8 = vld1q_u8(s);
+ s += p;
+ q6u8 = vld1q_u8(s);
+ s += p;
+ q7u8 = vld1q_u8(s);
+ s += p;
+ q8u8 = vld1q_u8(s);
+ s += p;
+ q9u8 = vld1q_u8(s);
+ s += p;
+ q10u8 = vld1q_u8(s);
+
+ vp9_loop_filter_neon_16(qblimit, qlimit, qthresh,
+ q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+ &q5u8, &q6u8, &q7u8, &q8u8);
-void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
- vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+ s -= (p * 5);
+ vst1q_u8(s, q5u8);
+ s += p;
+ vst1q_u8(s, q6u8);
+ s += p;
+ vst1q_u8(s, q7u8);
+ s += p;
+ vst1q_u8(s, q8u8);
+ return;
}
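
The vabdq_u8/vmaxq_u8/vcgeq_u8 sequence at the top of vp9_loop_filter_neon_16 evaluates, sixteen columns at a time, the same condition as the scalar filter_mask logic in vp9_loopfilter.c. A per-column sketch, returning a boolean instead of the 0/-1 lane masks the vector code carries:

#include <stdint.h>
#include <stdlib.h>

/* 1 if this column should be filtered: every neighbouring difference is
   within limit and the weighted edge difference is within blimit. */
static int filter_mask(uint8_t limit, uint8_t blimit,
                       uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                       uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
  return abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
         abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
         abs(q2 - q1) <= limit && abs(q3 - q2) <= limit &&
         abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
}
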
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
index 5b8ec20287d..5b8ec20287d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
new file mode 100644
index 00000000000..fd9db6187c6
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static INLINE void vp9_loop_filter_neon(
+ uint8x8_t dblimit, // flimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d4ru8, // p1
+ uint8x8_t *d5ru8, // p0
+ uint8x8_t *d6ru8, // q0
+ uint8x8_t *d7ru8) { // q1
+ uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+ int16x8_t q12s16;
+ int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d3u8 = vabd_u8(d17u8, d16u8);
+ d4u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+ d3u8 = vmax_u8(d3u8, d4u8);
+ d23u8 = vmax_u8(d19u8, d20u8);
+
+ d17u8 = vabd_u8(d6u8, d7u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+ d22u8 = vcgt_u8(d22u8, dthresh);
+ d23u8 = vmax_u8(d23u8, d3u8);
+
+ d28u8 = vabd_u8(d5u8, d16u8);
+ d17u8 = vqadd_u8(d17u8, d17u8);
+
+ d23u8 = vcge_u8(dlimit, d23u8);
+
+ d18u8 = vdup_n_u8(0x80);
+ d5u8 = veor_u8(d5u8, d18u8);
+ d6u8 = veor_u8(d6u8, d18u8);
+ d7u8 = veor_u8(d7u8, d18u8);
+ d16u8 = veor_u8(d16u8, d18u8);
+
+ d28u8 = vshr_n_u8(d28u8, 1);
+ d17u8 = vqadd_u8(d17u8, d28u8);
+
+ d19u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
+ vreinterpret_s8_u8(d6u8));
+
+ d17u8 = vcge_u8(dblimit, d17u8);
+
+ d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
+ vreinterpret_s8_u8(d16u8));
+
+ d22u8 = vorr_u8(d21u8, d22u8);
+
+ q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+ d23u8 = vand_u8(d23u8, d17u8);
+
+ q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+ d17u8 = vdup_n_u8(4);
+
+ d27s8 = vqmovn_s16(q12s16);
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+ d27s8 = vreinterpret_s8_u8(d27u8);
+
+ d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+ d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+ d28s8 = vshr_n_s8(d28s8, 3);
+ d27s8 = vshr_n_s8(d27s8, 3);
+
+ d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+ d27s8 = vrshr_n_s8(d27s8, 1);
+ d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+ d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+ d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+ *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+ *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+ *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+ return;
+}
+
+void vp9_lpf_horizontal_4_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+ if (count == 0) // end_vp9_lf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < count; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ s -= (pitch * 5);
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ s += pitch;
+ vst1_u8(s, d6u8);
+ s += pitch;
+ vst1_u8(s, d7u8);
+ }
+ return;
+}
+
+void vp9_lpf_vertical_4_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i, pitch8;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+
+ if (count == 0) // end_vp9_lf_v_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ pitch8 = pitch * 8;
+ for (i = 0; i < count; i++, src += pitch8) {
+ s = src - (i + 1) * 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+ vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+ vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+ vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+ vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ d4Result.val[0] = d4u8;
+ d4Result.val[1] = d5u8;
+ d4Result.val[2] = d6u8;
+ d4Result.val[3] = d7u8;
+
+ src -= 2;
+ vst4_lane_u8(src, d4Result, 0);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 1);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 2);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 3);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 4);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 5);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 6);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 7);
+ }
+ return;
+}
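
The vtrn_u32/vtrn_u16/vtrn_u8 cascade in vp9_lpf_vertical_4_neon is a three-stage 8x8 byte transpose: swapping 4x4, 2x2 and finally 1x1 sub-blocks turns eight row vectors into eight column vectors without leaving the NEON registers. The scalar equivalent, for reference:

#include <stdint.h>

/* Plain 8x8 byte transpose; the three vtrn passes above produce the
   same layout in-register. */
static void transpose_8x8(const uint8_t in[8][8], uint8_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      out[c][r] = in[r][c];
}
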
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
new file mode 100644
index 00000000000..7738e0d3abc
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_lpf_horizontal_4_neon|
+ EXPORT |vp9_lpf_vertical_4_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vp9 works on only 8 iterations at a time, whereas the vp8 loop
+; filter works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_lpf_horizontal_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_lpf_horizontal_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #8] ; load count
+ ldr r2, [sp, #4] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ cmp r12, #0
+ beq end_vp9_lf_h_edge
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+count_lf_h_loop
+ sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r3, r2, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r2@64], r1 ; p3
+ vld1.u8 {d4}, [r3@64], r1 ; p2
+ vld1.u8 {d5}, [r2@64], r1 ; p1
+ vld1.u8 {d6}, [r3@64], r1 ; p0
+ vld1.u8 {d7}, [r2@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r2@64] ; q2
+ vld1.u8 {d18}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl vp9_loop_filter_neon
+
+ vst1.u8 {d4}, [r2@64], r1 ; store op1
+ vst1.u8 {d5}, [r3@64], r1 ; store op0
+ vst1.u8 {d6}, [r2@64], r1 ; store oq0
+ vst1.u8 {d7}, [r3@64], r1 ; store oq1
+
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne count_lf_h_loop
+
+end_vp9_lf_h_edge
+ pop {pc}
+ ENDP ; |vp9_lpf_horizontal_4_neon|
+
+; Currently vp9 works on only 8 iterations at a time, whereas the vp8 loop
+; filter works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_lpf_vertical_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_lpf_vertical_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #8] ; load count
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #4] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+ cmp r12, #0
+ beq end_vp9_lf_v_edge
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+count_lf_v_loop
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+ ;transpose the 8x8 matrix
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ bl vp9_loop_filter_neon
+
+ sub r0, r0, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+ add r0, r0, r1, lsl #3 ; s += pitch * 8
+ subs r12, r12, #1
+ subne r2, r0, #4 ; move s pointer down by 4 columns
+ bne count_lf_v_loop
+
+end_vp9_lf_v_edge
+ pop {pc}
+ ENDP ; |vp9_lpf_vertical_4_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d4 op1
+; d5 op0
+; d6 oq0
+; d7 oq1
+|vp9_loop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d17, d6, d7 ; abs(p0 - q0)
+
+ vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
+
+ vmov.u8 d18, #0x80
+
+ vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
+
+ ; hevmask
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
+
+ vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
+
+ veor d7, d7, d18 ; qs0
+
+ vcge.u8 d23, d1, d23 ; m1 <= limit
+
+ ; filter() function
+ ; convert to signed
+
+ vshr.u8 d28, d28, #1 ; a = a / 2
+ veor d6, d6, d18 ; ps0
+
+ veor d5, d5, d18 ; ps1
+ vqadd.u8 d17, d17, d28 ; a = b + a
+
+ veor d16, d16, d18 ; qs1
+
+ vmov.u8 d19, #3
+
+ vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
+
+ vcge.u8 d17, d0, d17 ; a <= blimit
+
+ vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
+ vorr d22, d21, d22 ; hevmask
+
+ vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
+
+ vand d27, d27, d22 ; filter &= hev
+ vand d23, d23, d17 ; filter_mask
+
+ vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d17, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d27, q12
+
+ vand d27, d27, d23 ; filter &= mask
+
+ vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
+ vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
+ vshr.s8 d28, d28, #3 ; filter2 >>= 3
+ vshr.s8 d27, d27, #3 ; filter1 >>= 3
+
+ vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
+ vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 d27, d27, #1 ; filter = (filter1 + 1) >> 1
+
+ veor d6, d26, d18 ; *oq0 = u^0x80
+
+ vbic d27, d27, d22 ; filter &= ~hev
+
+ vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
+ vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
+
+ veor d5, d19, d18 ; *op0 = u^0x80
+ veor d4, d21, d18 ; *op1 = u^0x80
+ veor d7, d20, d18 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |vp9_loop_filter_neon|
+
+ END
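
The register-level comments in |vp9_loop_filter_neon| above track vp9's scalar 4-tap filter update. A per-column sketch of the same arithmetic, assuming the values are already biased into signed range by the XOR with 0x80 and that mask/hev arrive as 0 or -1 lane masks, as in vp9_loopfilter.c:

#include <stdint.h>

static int8_t signed_char_clamp(int t) {
  return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

static void filter4(int8_t mask, int8_t hev,
                    int8_t *ps1, int8_t *ps0, int8_t *qs0, int8_t *qs1) {
  int8_t filter, filter1, filter2;

  filter = signed_char_clamp(*ps1 - *qs1) & hev;  /* inner taps only if hev */
  filter = signed_char_clamp(filter + 3 * (*qs0 - *ps0)) & mask;

  filter1 = signed_char_clamp(filter + 4) >> 3;
  filter2 = signed_char_clamp(filter + 3) >> 3;

  *qs0 = signed_char_clamp(*qs0 - filter1);
  *ps0 = signed_char_clamp(*ps0 + filter2);

  /* outer tap adjustment, skipped for high edge variance (hev) columns */
  filter = (int8_t)((filter1 + 1) >> 1) & ~hev;
  *qs1 = signed_char_clamp(*qs1 - filter);
  *ps1 = signed_char_clamp(*ps1 + filter);
}
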
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
new file mode 100644
index 00000000000..33068a8a203
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static INLINE void vp9_mbloop_filter_neon(
+ uint8x8_t dblimit, // mblimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d0ru8, // p2
+ uint8x8_t *d1ru8, // p1
+ uint8x8_t *d2ru8, // p0
+ uint8x8_t *d3ru8, // q0
+ uint8x8_t *d4ru8, // q1
+ uint8x8_t *d5ru8) { // q2
+ uint32_t flat;
+ uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+ uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int16x8_t q15s16;
+ uint16x8_t q10u16, q14u16;
+ int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d23u8 = vabd_u8(d17u8, d16u8);
+ d24u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+
+ d25u8 = vabd_u8(d6u8, d4u8);
+
+ d23u8 = vmax_u8(d23u8, d24u8);
+
+ d26u8 = vabd_u8(d7u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+
+ d24u8 = vabd_u8(d6u8, d7u8);
+ d27u8 = vabd_u8(d3u8, d6u8);
+ d28u8 = vabd_u8(d18u8, d7u8);
+
+ d19u8 = vmax_u8(d19u8, d23u8);
+
+ d23u8 = vabd_u8(d5u8, d16u8);
+ d24u8 = vqadd_u8(d24u8, d24u8);
+
+
+ d19u8 = vcge_u8(dlimit, d19u8);
+
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+ d26u8 = vmax_u8(d27u8, d28u8);
+
+ d23u8 = vshr_n_u8(d23u8, 1);
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+
+ d24u8 = vqadd_u8(d24u8, d23u8);
+
+ d20u8 = vmax_u8(d20u8, d25u8);
+
+ d23u8 = vdup_n_u8(1);
+ d24u8 = vcge_u8(dblimit, d24u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+
+ d20u8 = vcge_u8(d23u8, d20u8);
+
+ d19u8 = vand_u8(d19u8, d24u8);
+
+ d23u8 = vcgt_u8(d22u8, dthresh);
+
+ d20u8 = vand_u8(d20u8, d19u8);
+
+ d22u8 = vdup_n_u8(0x80);
+
+ d23u8 = vorr_u8(d21u8, d23u8);
+
+ q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+ vreinterpret_u16_u8(d21u8));
+
+ d30u8 = vshrn_n_u16(q10u16, 4);
+ flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+ if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
+ d27u8 = vdup_n_u8(3);
+ d21u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+ } else {
+ d21u8 = veor_u8(d7u8, d22u8);
+ d24u8 = veor_u8(d6u8, d22u8);
+ d25u8 = veor_u8(d5u8, d22u8);
+ d26u8 = veor_u8(d16u8, d22u8);
+
+ d27u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+ d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+ q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+ d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ q15s16 = vaddw_s8(q15s16, d29s8);
+
+ d29u8 = vdup_n_u8(4);
+
+ d28s8 = vqmovn_s16(q15s16);
+
+ d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+ d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+ d30s8 = vshr_n_s8(d30s8, 3);
+ d29s8 = vshr_n_s8(d29s8, 3);
+
+ d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+ d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+ d29s8 = vrshr_n_s8(d29s8, 1);
+ d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+ if (flat == 0) { // filter_branch_only
+ *d0ru8 = d4u8;
+ *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+ *d5ru8 = d17u8;
+ return;
+ }
+
+ d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
+ d23u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+ d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+ q14u16 = vaddw_u8(q14u16, d5u8);
+
+ d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+ d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+ d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+
+ *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+ d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+
+ *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+ d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+ d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+ d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+ *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+ *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+ *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+ }
+ return;
+}
+
+void vp9_lpf_horizontal_8_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+
+ if (count == 0) // end_vp9_mblf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < count; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ s -= (pitch * 6);
+ vst1_u8(s, d0u8);
+ s += pitch;
+ vst1_u8(s, d1u8);
+ s += pitch;
+ vst1_u8(s, d2u8);
+ s += pitch;
+ vst1_u8(s, d3u8);
+ s += pitch;
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ }
+ return;
+}
+
+void vp9_lpf_vertical_8_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+ uint8x8x2_t d2Result;
+
+ if (count == 0)
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ for (i = 0; i < count; i++) {
+ s = src + (i * (pitch << 3)) - 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+ vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+ vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+ vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+ vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ d4Result.val[0] = d0u8;
+ d4Result.val[1] = d1u8;
+ d4Result.val[2] = d2u8;
+ d4Result.val[3] = d3u8;
+
+ d2Result.val[0] = d4u8;
+ d2Result.val[1] = d5u8;
+
+ s = src - 3;
+ vst4_lane_u8(s, d4Result, 0);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 1);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 2);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 3);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 4);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 5);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 6);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 7);
+
+ s = src + 1;
+ vst2_lane_u8(s, d2Result, 0);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 1);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 2);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 3);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 4);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 5);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 6);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 7);
+ }
+ return;
+}
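
The vertical variant reuses the same horizontal filter kernel by transposing the 8x8 block in registers: vtrn_u32, then vtrn_u16, then vtrn_u8 swap progressively finer lanes. A scalar sketch of the equivalent operation (transpose8x8 is a hypothetical helper, not a libvpx function):

#include <stdint.h>

static void transpose8x8(const uint8_t in[8][8], uint8_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; r++)
    for (c = 0; c < 8; c++)
      out[c][r] = in[r][c];  /* output column r is input row r */
}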
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
index 4430322171d..91aaec04eaf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
@@ -8,8 +8,6 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_lpf_horizontal_4_neon|
- EXPORT |vp9_lpf_vertical_4_neon|
EXPORT |vp9_lpf_horizontal_8_neon|
EXPORT |vp9_lpf_vertical_8_neon|
ARM
@@ -21,261 +19,6 @@
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
-; void vp9_lpf_horizontal_4_neon(uint8_t *s,
-; int p /* pitch */,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
-;
-; r0 uint8_t *s,
-; r1 int p, /* pitch */
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh,
-; sp+4 int count
-|vp9_lpf_horizontal_4_neon| PROC
- push {lr}
-
- vld1.8 {d0[]}, [r2] ; duplicate *blimit
- ldr r12, [sp, #8] ; load count
- ldr r2, [sp, #4] ; load thresh
- add r1, r1, r1 ; double pitch
-
- cmp r12, #0
- beq end_vp9_lf_h_edge
-
- vld1.8 {d1[]}, [r3] ; duplicate *limit
- vld1.8 {d2[]}, [r2] ; duplicate *thresh
-
-count_lf_h_loop
- sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
- add r3, r2, r1, lsr #1 ; set to 3 lines down
-
- vld1.u8 {d3}, [r2@64], r1 ; p3
- vld1.u8 {d4}, [r3@64], r1 ; p2
- vld1.u8 {d5}, [r2@64], r1 ; p1
- vld1.u8 {d6}, [r3@64], r1 ; p0
- vld1.u8 {d7}, [r2@64], r1 ; q0
- vld1.u8 {d16}, [r3@64], r1 ; q1
- vld1.u8 {d17}, [r2@64] ; q2
- vld1.u8 {d18}, [r3@64] ; q3
-
- sub r2, r2, r1, lsl #1
- sub r3, r3, r1, lsl #1
-
- bl vp9_loop_filter_neon
-
- vst1.u8 {d4}, [r2@64], r1 ; store op1
- vst1.u8 {d5}, [r3@64], r1 ; store op0
- vst1.u8 {d6}, [r2@64], r1 ; store oq0
- vst1.u8 {d7}, [r3@64], r1 ; store oq1
-
- add r0, r0, #8
- subs r12, r12, #1
- bne count_lf_h_loop
-
-end_vp9_lf_h_edge
- pop {pc}
- ENDP ; |vp9_lpf_horizontal_4_neon|
-
-; Currently vp9 only works on 8 iterations at a time. The vp8 loop filter
-; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
-;
-; void vp9_lpf_vertical_4_neon(uint8_t *s,
-; int p /* pitch */,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
-;
-; r0 uint8_t *s,
-; r1 int p, /* pitch */
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh,
-; sp+4 int count
-|vp9_lpf_vertical_4_neon| PROC
- push {lr}
-
- vld1.8 {d0[]}, [r2] ; duplicate *blimit
- ldr r12, [sp, #8] ; load count
- vld1.8 {d1[]}, [r3] ; duplicate *limit
-
- ldr r3, [sp, #4] ; load thresh
- sub r2, r0, #4 ; move s pointer down by 4 columns
- cmp r12, #0
- beq end_vp9_lf_v_edge
-
- vld1.8 {d2[]}, [r3] ; duplicate *thresh
-
-count_lf_v_loop
- vld1.u8 {d3}, [r2], r1 ; load s data
- vld1.u8 {d4}, [r2], r1
- vld1.u8 {d5}, [r2], r1
- vld1.u8 {d6}, [r2], r1
- vld1.u8 {d7}, [r2], r1
- vld1.u8 {d16}, [r2], r1
- vld1.u8 {d17}, [r2], r1
- vld1.u8 {d18}, [r2]
-
- ;transpose the 8x8 block
- vtrn.32 d3, d7
- vtrn.32 d4, d16
- vtrn.32 d5, d17
- vtrn.32 d6, d18
-
- vtrn.16 d3, d5
- vtrn.16 d4, d6
- vtrn.16 d7, d17
- vtrn.16 d16, d18
-
- vtrn.8 d3, d4
- vtrn.8 d5, d6
- vtrn.8 d7, d16
- vtrn.8 d17, d18
-
- bl vp9_loop_filter_neon
-
- sub r0, r0, #2
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
- vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
- vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
- vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
- vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
- vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
- vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
- vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
-
- add r0, r0, r1, lsl #3 ; s += pitch * 8
- subs r12, r12, #1
- subne r2, r0, #4 ; move s pointer down by 4 columns
- bne count_lf_v_loop
-
-end_vp9_lf_v_edge
- pop {pc}
- ENDP ; |vp9_lpf_vertical_4_neon|
-
-; void vp9_loop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0 blimit
-; d1 limit
-; d2 thresh
-; d3 p3
-; d4 p2
-; d5 p1
-; d6 p0
-; d7 q0
-; d16 q1
-; d17 q2
-; d18 q3
-;
-; Outputs:
-; d4 op1
-; d5 op0
-; d6 oq0
-; d7 oq1
-|vp9_loop_filter_neon| PROC
- ; filter_mask
- vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
- vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
- vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
- vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
- vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
- vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
-
- ; only compare the largest value to limit
- vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
- vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
-
- vabd.u8 d17, d6, d7 ; abs(p0 - q0)
-
- vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
-
- vmov.u8 d18, #0x80
-
- vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
-
- ; hevmask
- vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
- vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
-
- vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
- vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
-
- veor d7, d7, d18 ; qs0
-
- vcge.u8 d23, d1, d23 ; mask: limit >= m1
-
- ; filter() function
- ; convert to signed
-
- vshr.u8 d28, d28, #1 ; a = a / 2
- veor d6, d6, d18 ; ps0
-
- veor d5, d5, d18 ; ps1
- vqadd.u8 d17, d17, d28 ; a = b + a
-
- veor d16, d16, d18 ; qs1
-
- vmov.u8 d19, #3
-
- vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
-
- vcge.u8 d17, d0, d17 ; mask: blimit >= a
-
- vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
- vorr d22, d21, d22 ; hevmask
-
- vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
-
- vand d27, d27, d22 ; filter &= hev
- vand d23, d23, d17 ; filter_mask
-
- vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
-
- vmov.u8 d17, #4
-
- ; filter = clamp(filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d27, q12
-
- vand d27, d27, d23 ; filter &= mask
-
- vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
- vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
- vshr.s8 d28, d28, #3 ; filter2 >>= 3
- vshr.s8 d27, d27, #3 ; filter1 >>= 3
-
- vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
- vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
-
- ; outer tap adjustments
- vrshr.s8 d27, d27, #1 ; filter = (filter1 + 1) >> 1
-
- veor d6, d26, d18 ; *oq0 = u^0x80
-
- vbic d27, d27, d22 ; filter &= ~hev
-
- vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
- vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
-
- veor d5, d19, d18 ; *op0 = u^0x80
- veor d4, d21, d18 ; *op1 = u^0x80
- veor d7, d20, d18 ; *oq1 = u^0x80
-
- bx lr
- ENDP ; |vp9_loop_filter_neon|
-
; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
; const uint8_t *blimit,
; const uint8_t *limit,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.c
new file mode 100644
index 00000000000..31fcc63ba06
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+#if HAVE_NEON_ASM
+void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+ vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+}
+#endif // HAVE_NEON_ASM
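
The dual wrappers cover a 16-sample edge by running the single-edge kernel twice: vertical duals advance 8 rows (s + 8 * p), horizontal duals advance 8 columns (s + 8). A hedged usage sketch, assuming the file is linked into a libvpx build; the buffer size, pitch, and threshold values are illustrative only (real thresholds come from the loop-filter threshold tables):

#include <stdint.h>
#include <string.h>

void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
                                  const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1);

int main(void) {
  static uint8_t frame[24 * 32];                      /* 24 rows, pitch 32 */
  static const uint8_t blimit[8] = { 16, 16, 16, 16, 16, 16, 16, 16 };
  static const uint8_t limit[8]  = {  8,  8,  8,  8,  8,  8,  8,  8 };
  static const uint8_t thresh[8] = {  4,  4,  4,  4,  4,  4,  4,  4 };
  memset(frame, 128, sizeof(frame));
  /* filter the vertical edge at column 8 across rows 0..15 */
  vp9_lpf_vertical_4_dual_neon(frame + 8, 32, blimit, limit, thresh,
                               blimit, limit, thresh);
  return 0;
}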
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c
new file mode 100644
index 00000000000..d0beaa7208f
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_v_predictor_4x4_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint32x2_t d0u32 = vdup_n_u32(0);
+ (void)left;
+
+ d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+ for (i = 0; i < 4; i++, dst += y_stride)
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ return;
+}
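
The V predictors simply broadcast the row above the block into every output row. A scalar reference form (the generic block-size parameter is an editorial convenience; libvpx keys these functions by fixed sizes):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void v_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
                          const uint8_t *above) {
  int r;
  for (r = 0; r < bs; r++, dst += stride)
    memcpy(dst, above, bs);  /* every row repeats the row above the block */
}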
+
+void vp9_v_predictor_8x8_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ (void)left;
+
+ d0u8 = vld1_u8(above);
+ for (i = 0; i < 8; i++, dst += y_stride)
+ vst1_u8(dst, d0u8);
+ return;
+}
+
+void vp9_v_predictor_16x16_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ for (i = 0; i < 16; i++, dst += y_stride)
+ vst1q_u8(dst, q0u8);
+ return;
+}
+
+void vp9_v_predictor_32x32_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ q1u8 = vld1q_u8(above + 16);
+ for (i = 0; i < 32; i++, dst += y_stride) {
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ }
+ return;
+}
+
+void vp9_h_predictor_4x4_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d1u32 = vdup_n_u32(0);
+ (void)above;
+
+ d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ return;
+}
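
Likewise, the H predictors replicate each left-column pixel across its row; the vdup_lane_u8 calls above are per-row broadcasts. A scalar sketch under the same caveat:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void h_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
                          const uint8_t *left) {
  int r;
  for (r = 0; r < bs; r++, dst += stride)
    memset(dst, left[r], bs);  /* each row replicates its left neighbor */
}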
+
+void vp9_h_predictor_8x8_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint64x1_t d1u64 = vdup_n_u64(0);
+ (void)above;
+
+ d1u64 = vld1_u64((const uint64_t *)left);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+ vst1_u8(dst, d0u8);
+ return;
+}
+
+void vp9_h_predictor_16x16_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ }
+ return;
+}
+
+void vp9_h_predictor_32x32_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j, k;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ for (k = 0; k < 2; k++, left += 16) {
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ }
+ }
+ return;
+}
+
+void vp9_tm_predictor_4x4_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint16x8_t q1u16, q3u16;
+ int16x8_t q1s16;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d2u32 = vdup_n_u32(0);
+
+ d0u8 = vdup_n_u8(above[-1]);
+ d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+ q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+ for (i = 0; i < 4; i++, dst += y_stride) {
+ q1u16 = vdupq_n_u16((uint16_t)left[i]);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
+ vreinterpretq_s16_u16(q3u16));
+ d0u8 = vqmovun_s16(q1s16);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ }
+ return;
+}
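
The TM (TrueMotion) predictors compute pred[r][c] = clip(left[r] + above[c] - above[-1]); the vsubl_u8 above forms (above[c] - top_left) once per block, and vqmovun_s16 supplies the clamp. A scalar sketch (clip_u8 is a hypothetical helper):

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_u8(int v) {
  return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void tm_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
                           const uint8_t *above, const uint8_t *left) {
  int r, c;
  const int top_left = above[-1];  /* pixel above-left of the block */
  for (r = 0; r < bs; r++, dst += stride)
    for (c = 0; c < bs; c++)
      dst[c] = clip_u8(left[r] + above[c] - top_left);
}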
+
+void vp9_tm_predictor_8x8_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j;
+ uint16x8_t q0u16, q3u16, q10u16;
+ int16x8_t q0s16;
+ uint16x4_t d20u16;
+ uint8x8_t d0u8, d2u8, d30u8;
+
+ d0u8 = vdup_n_u8(above[-1]);
+ d30u8 = vld1_u8(left);
+ d2u8 = vld1_u8(above);
+ q10u16 = vmovl_u8(d30u8);
+ q3u16 = vsubl_u8(d2u8, d0u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 1);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 3);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ }
+ return;
+}
+
+void vp9_tm_predictor_16x16_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+ uint8x16_t q0u8, q1u8;
+ int16x8_t q0s16, q1s16, q8s16, q11s16;
+ uint16x4_t d20u16;
+ uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+
+ q0u8 = vdupq_n_u8(above[-1]);
+ q1u8 = vld1q_u8(above);
+ q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ for (k = 0; k < 2; k++, left += 8) {
+ d18u8 = vld1_u8(left);
+ q10u16 = vmovl_u8(d18u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q8u16 = vdupq_lane_u16(d20u16, 1);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += y_stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q8u16 = vdupq_lane_u16(d20u16, 3);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += y_stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += y_stride;
+ }
+ }
+ return;
+}
+
+void vp9_tm_predictor_32x32_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+ uint8x16_t q0u8, q1u8, q2u8;
+ int16x8_t q12s16, q13s16, q14s16, q15s16;
+ uint16x4_t d6u16;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+
+ q0u8 = vdupq_n_u8(above[-1]);
+ q1u8 = vld1q_u8(above);
+ q2u8 = vld1q_u8(above + 16);
+ q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+ q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+ for (k = 0; k < 4; k++, left += 8) {
+ d26u8 = vld1_u8(left);
+ q3u16 = vmovl_u8(d26u8);
+ d6u16 = vget_low_u16(q3u16);
+ for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+ q0u16 = vdupq_lane_u16(d6u16, 0);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 1);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 2);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 3);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+ }
+ }
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm
index dc9856fa887..dc9856fa887 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
index ab18490dce3..17422798c13 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
@@ -413,7 +413,7 @@ void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Fixed size intermediate buffer places limits on parameters. */
- DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
assert(w <= 64);
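
DECLARE_ALIGNED_ARRAY over-allocated and aligned the array by hand; DECLARE_ALIGNED leans on compiler alignment attributes instead. Roughly what the new macro expands to, paraphrased from vpx_ports/mem.h rather than copied verbatim:

#if defined(__GNUC__)
#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
#elif defined(_MSC_VER)
#define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val
#else
#define DECLARE_ALIGNED(n, typ, val) typ val
#endif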
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
index 0ef9dd508e5..58b50d2df93 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
@@ -950,7 +950,7 @@ void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
uint32_t pos = 38;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c
new file mode 100644
index 00000000000..e2247435e88
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c
@@ -0,0 +1,1045 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+
+ PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
+}
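
Each 8-tap horizontal pass computes out(x) = clip((sum over k of filter[k] * src[x + k - 3] + 64) >> FILTER_BITS) with FILTER_BITS == 7; the SRARI_SATURATE step above is that rounded shift plus clamp. A scalar equivalent (hz_8tap_row_c and clip_u8 are hypothetical names; src needs three bytes of left context, matching the src -= 3 above):

#include <stdint.h>

#define FILTER_BITS 7

static uint8_t clip_u8(int v) {
  return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void hz_8tap_row_c(const uint8_t *src, uint8_t *dst, int w,
                          const int8_t *filter) {
  int x, k;
  for (x = 0; x < w; x++) {
    int sum = 0;
    for (k = 0; k < 8; k++)
      sum += filter[k] * src[x + k - 3];  /* taps span src[x-3] .. src[x+4] */
    dst[x] = clip_u8((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
  }
}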
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ PCKEV_2B_XORI128_STORE_4_BYTES_4(out2, out3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1, out2,
+ out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LOAD_SB(src);
+ src1 = LOAD_SB(src + 8);
+ src += src_stride;
+ src2 = LOAD_SB(src);
+ src3 = LOAD_SB(src + 8);
+ src += src_stride;
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
+ dst += dst_stride;
+ PCKEV_B_XORI128_STORE_VEC(out3, out2, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LOAD_SB(src);
+ src2 = LOAD_SB(src + 16);
+ src3 = LOAD_SB(src + 24);
+ src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
+ src += src_stride;
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ src0 = LOAD_SB(src);
+ src2 = LOAD_SB(src + 16);
+ src3 = LOAD_SB(src + 24);
+ src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
+
+ PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
+ PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
+ dst += dst_stride;
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
+ PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ for (cnt = 0; cnt < 2; ++cnt) {
+ src0 = LOAD_SB(&src[cnt << 5]);
+ src2 = LOAD_SB(&src[16 + (cnt << 5)]);
+ src3 = LOAD_SB(&src[24 + (cnt << 5)]);
+ src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_B_XORI128_STORE_VEC(out1, out0, &dst[cnt << 5]);
+ PCKEV_B_XORI128_STORE_VEC(out3, out2, &dst[16 + (cnt << 5)]);
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 vec0, vec1, filt0;
+ v16i8 res0, res1;
+ v8u16 vec2, vec3, filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src1, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src3, src2);
+
+ vec2 = __msa_dotp_u_h(vec0, filt0);
+ vec3 = __msa_dotp_u_h(vec1, filt0);
+
+ vec2 = (v8u16)__msa_srari_h((v8i16)vec2, FILTER_BITS);
+ vec3 = (v8u16)__msa_srari_h((v8i16)vec3, FILTER_BITS);
+
+ vec2 = __msa_min_u_h(vec2, const255);
+ vec3 = __msa_min_u_h(vec3, const255);
+
+ res0 = __msa_pckev_b((v16i8)vec2, (v16i8)vec2);
+ res1 = __msa_pckev_b((v16i8)vec3, (v16i8)vec3);
+
+ out0 = __msa_copy_u_w((v4i32)res0, 0);
+ out1 = __msa_copy_u_w((v4i32)res0, 1);
+ out2 = __msa_copy_u_w((v4i32)res1, 0);
+ out3 = __msa_copy_u_w((v4i32)res1, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+}
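
The 2-tap path is plain bilinear filtering: two adjacent pixels weighted by the two nonzero taps, rounded by FILTER_BITS, then clamped, which is what the __msa_min_u_h(..., 255) above does. A scalar sketch; filter here points at the two live taps, i.e. &filt_hor[3] in the caller at the bottom of this file:

#include <stdint.h>

#define FILTER_BITS 7

static void hz_2tap_row_c(const uint8_t *src, uint8_t *dst, int w,
                          const int8_t *filter) {
  int x;
  for (x = 0; x < w; x++) {
    int sum = filter[0] * src[x] + filter[1] * src[x + 1];
    sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;  /* round */
    dst[x] = sum > 255 ? 255 : (uint8_t)sum;                /* clamp */
  }
}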
+
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t out0, out1, out2, out3;
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 vec0, vec1, vec2, vec3;
+ v8u16 vec4, vec5, vec6, vec7;
+ v16i8 res0, res1, res2, res3;
+ v8u16 filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LOAD_8VECS_SB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src1, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src3, src2);
+ vec2 = (v16u8)__msa_vshf_b(mask, src5, src4);
+ vec3 = (v16u8)__msa_vshf_b(mask, src7, src6);
+
+ vec4 = __msa_dotp_u_h(vec0, filt0);
+ vec5 = __msa_dotp_u_h(vec1, filt0);
+ vec6 = __msa_dotp_u_h(vec2, filt0);
+ vec7 = __msa_dotp_u_h(vec3, filt0);
+
+ vec4 = (v8u16)__msa_srari_h((v8i16)vec4, FILTER_BITS);
+ vec5 = (v8u16)__msa_srari_h((v8i16)vec5, FILTER_BITS);
+ vec6 = (v8u16)__msa_srari_h((v8i16)vec6, FILTER_BITS);
+ vec7 = (v8u16)__msa_srari_h((v8i16)vec7, FILTER_BITS);
+
+ vec4 = __msa_min_u_h(vec4, const255);
+ vec5 = __msa_min_u_h(vec5, const255);
+ vec6 = __msa_min_u_h(vec6, const255);
+ vec7 = __msa_min_u_h(vec7, const255);
+
+ res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
+ res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
+ res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
+ res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
+
+ out0 = __msa_copy_u_w((v4i32)res0, 0);
+ out1 = __msa_copy_u_w((v4i32)res0, 1);
+ out2 = __msa_copy_u_w((v4i32)res1, 0);
+ out3 = __msa_copy_u_w((v4i32)res1, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+ dst += dst_stride;
+
+ out0 = __msa_copy_u_w((v4i32)res2, 0);
+ out1 = __msa_copy_u_w((v4i32)res2, 1);
+ out2 = __msa_copy_u_w((v4i32)res3, 0);
+ out3 = __msa_copy_u_w((v4i32)res3, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+}
+
+static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 out0, out1, out2, out3;
+ v8u16 const255, filt;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+
+ vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+
+ vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
+ vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
+ vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
+
+ SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ out0 = __msa_min_u_h(vec0, const255);
+ out1 = __msa_min_u_h(vec1, const255);
+ out2 = __msa_min_u_h(vec2, const255);
+ out3 = __msa_min_u_h(vec3, const255);
+
+ PCKEV_B_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+
+ vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
+ vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
+ vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
+
+ SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ vec0 = __msa_min_u_h(vec0, const255);
+ vec1 = __msa_min_u_h(vec1, const255);
+ vec2 = __msa_min_u_h(vec2, const255);
+ vec3 = __msa_min_u_h(vec3, const255);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+
+ vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
+ vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
+ vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
+
+ SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ vec0 = __msa_min_u_h(vec0, const255);
+ vec1 = __msa_min_u_h(vec1, const255);
+ vec2 = __msa_min_u_h(vec2, const255);
+ vec3 = __msa_min_u_h(vec3, const255);
+
+ PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ if (16 == height) {
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+
+ vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
+ vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
+ vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
+
+ SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3,
+ vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ vec0 = __msa_min_u_h(vec0, const255);
+ vec1 = __msa_min_u_h(vec1, const255);
+ vec2 = __msa_min_u_h(vec2, const255);
+ vec3 = __msa_min_u_h(vec3, const255);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+
+ vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
+ vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
+ vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
+
+ SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3,
+ vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ vec0 = __msa_min_u_h(vec0, const255);
+ vec1 = __msa_min_u_h(vec1, const255);
+ vec2 = __msa_min_u_h(vec2, const255);
+ vec3 = __msa_min_u_h(vec3, const255);
+
+ PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+ }
+}
+
+static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8u16 filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ loop_cnt = (height >> 2) - 1;
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ src0 = LOAD_SB(src);
+ src1 = LOAD_SB(src + 8);
+ src += src_stride;
+ src2 = LOAD_SB(src);
+ src3 = LOAD_SB(src + 8);
+ src += src_stride;
+ src4 = LOAD_SB(src);
+ src5 = LOAD_SB(src + 8);
+ src += src_stride;
+ src6 = LOAD_SB(src);
+ src7 = LOAD_SB(src + 8);
+ src += src_stride;
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
+ vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
+ vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
+ vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
+ vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
+
+ out0 = __msa_dotp_u_h(vec0, filt0);
+ out1 = __msa_dotp_u_h(vec1, filt0);
+ out2 = __msa_dotp_u_h(vec2, filt0);
+ out3 = __msa_dotp_u_h(vec3, filt0);
+ out4 = __msa_dotp_u_h(vec4, filt0);
+ out5 = __msa_dotp_u_h(vec5, filt0);
+ out6 = __msa_dotp_u_h(vec6, filt0);
+ out7 = __msa_dotp_u_h(vec7, filt0);
+
+ out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
+ out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
+ out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
+ out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
+ out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
+ out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
+ out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
+ out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
+
+ out0 = __msa_min_u_h(out0, const255);
+ out1 = __msa_min_u_h(out1, const255);
+ out2 = __msa_min_u_h(out2, const255);
+ out3 = __msa_min_u_h(out3, const255);
+ out4 = __msa_min_u_h(out4, const255);
+ out5 = __msa_min_u_h(out5, const255);
+ out6 = __msa_min_u_h(out6, const255);
+ out7 = __msa_min_u_h(out7, const255);
+
+ PCKEV_B_STORE_VEC(out1, out0, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out3, out2, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out5, out4, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out7, out6, dst);
+ dst += dst_stride;
+
+ for (; loop_cnt--;) {
+ src0 = LOAD_SB(src);
+ src1 = LOAD_SB(src + 8);
+ src += src_stride;
+ src2 = LOAD_SB(src);
+ src3 = LOAD_SB(src + 8);
+ src += src_stride;
+ src4 = LOAD_SB(src);
+ src5 = LOAD_SB(src + 8);
+ src += src_stride;
+ src6 = LOAD_SB(src);
+ src7 = LOAD_SB(src + 8);
+ src += src_stride;
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
+ vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
+ vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
+ vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
+ vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
+
+ out0 = __msa_dotp_u_h(vec0, filt0);
+ out1 = __msa_dotp_u_h(vec1, filt0);
+ out2 = __msa_dotp_u_h(vec2, filt0);
+ out3 = __msa_dotp_u_h(vec3, filt0);
+ out4 = __msa_dotp_u_h(vec4, filt0);
+ out5 = __msa_dotp_u_h(vec5, filt0);
+ out6 = __msa_dotp_u_h(vec6, filt0);
+ out7 = __msa_dotp_u_h(vec7, filt0);
+
+ out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
+ out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
+ out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
+ out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
+ out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
+ out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
+ out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
+ out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
+
+ out0 = __msa_min_u_h(out0, const255);
+ out1 = __msa_min_u_h(out1, const255);
+ out2 = __msa_min_u_h(out2, const255);
+ out3 = __msa_min_u_h(out3, const255);
+ out4 = __msa_min_u_h(out4, const255);
+ out5 = __msa_min_u_h(out5, const255);
+ out6 = __msa_min_u_h(out6, const255);
+ out7 = __msa_min_u_h(out7, const255);
+
+ PCKEV_B_STORE_VEC(out1, out0, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out3, out2, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out5, out4, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out7, out6, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8u16 filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ src0 = LOAD_SB(src);
+ src2 = LOAD_SB(src + 16);
+ src3 = LOAD_SB(src + 24);
+ src1 = __msa_sld_b(src2, src0, 8);
+ src += src_stride;
+ src4 = LOAD_SB(src);
+ src6 = LOAD_SB(src + 16);
+ src7 = LOAD_SB(src + 24);
+ src5 = __msa_sld_b(src6, src4, 8);
+ src += src_stride;
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
+ vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
+ vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
+ vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
+ vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
+
+ out0 = __msa_dotp_u_h(vec0, filt0);
+ out1 = __msa_dotp_u_h(vec1, filt0);
+ out2 = __msa_dotp_u_h(vec2, filt0);
+ out3 = __msa_dotp_u_h(vec3, filt0);
+ out4 = __msa_dotp_u_h(vec4, filt0);
+ out5 = __msa_dotp_u_h(vec5, filt0);
+ out6 = __msa_dotp_u_h(vec6, filt0);
+ out7 = __msa_dotp_u_h(vec7, filt0);
+
+ out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
+ out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
+ out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
+ out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
+ out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
+ out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
+ out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
+ out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
+
+ out0 = __msa_min_u_h(out0, const255);
+ out1 = __msa_min_u_h(out1, const255);
+ out2 = __msa_min_u_h(out2, const255);
+ out3 = __msa_min_u_h(out3, const255);
+ out4 = __msa_min_u_h(out4, const255);
+ out5 = __msa_min_u_h(out5, const255);
+ out6 = __msa_min_u_h(out6, const255);
+ out7 = __msa_min_u_h(out7, const255);
+
+ PCKEV_B_STORE_VEC(out1, out0, dst);
+ PCKEV_B_STORE_VEC(out3, out2, dst + 16);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out5, out4, dst);
+ PCKEV_B_STORE_VEC(out7, out6, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8u16 filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LOAD_SB(src);
+ src2 = LOAD_SB(src + 16);
+ src4 = LOAD_SB(src + 32);
+ src6 = LOAD_SB(src + 48);
+ src7 = LOAD_SB(src + 56);
+ src1 = __msa_sld_b(src2, src0, 8);
+ src3 = __msa_sld_b(src4, src2, 8);
+ src5 = __msa_sld_b(src6, src4, 8);
+ src += src_stride;
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
+ vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
+ vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
+ vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
+ vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
+
+ out0 = __msa_dotp_u_h(vec0, filt0);
+ out1 = __msa_dotp_u_h(vec1, filt0);
+ out2 = __msa_dotp_u_h(vec2, filt0);
+ out3 = __msa_dotp_u_h(vec3, filt0);
+ out4 = __msa_dotp_u_h(vec4, filt0);
+ out5 = __msa_dotp_u_h(vec5, filt0);
+ out6 = __msa_dotp_u_h(vec6, filt0);
+ out7 = __msa_dotp_u_h(vec7, filt0);
+
+ out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
+ out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
+ out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
+ out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
+ out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
+ out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
+ out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
+ out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
+
+ out0 = __msa_min_u_h(out0, const255);
+ out1 = __msa_min_u_h(out1, const255);
+ out2 = __msa_min_u_h(out2, const255);
+ out3 = __msa_min_u_h(out3, const255);
+ out4 = __msa_min_u_h(out4, const255);
+ out5 = __msa_min_u_h(out5, const255);
+ out6 = __msa_min_u_h(out6, const255);
+ out7 = __msa_min_u_h(out7, const255);
+
+ PCKEV_B_STORE_VEC(out1, out0, dst);
+ PCKEV_B_STORE_VEC(out3, out2, dst + 16);
+ PCKEV_B_STORE_VEC(out5, out4, dst + 32);
+ PCKEV_B_STORE_VEC(out7, out6, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ int8_t cnt, filt_hor[8];
+
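+  /* x_step_q4 is a Q4 fixed-point step: 16 means one source pixel per
+   * output pixel.  Any other step implies scaling, which these kernels
+   * do not handle, so defer to the C version. */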
+ if (16 != x_step_q4) {
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
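+  /* Taps 2..3 read as one little-endian word: 0x800000 means tap 3 is
+   * 128 (1.0 in Q7) and tap 2 is 0, i.e. a normalized VP9 filter that
+   * degenerates to the identity, so a plain copy suffices. */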
+ if (((const int32_t *)filter_x)[1] == 0x800000) {
+ vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+
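+  /* Taps 0..1 being zero marks VP9's bilinear filters (only taps 3..4
+   * are live), so the cheap 2-tap kernels apply; &filt_hor[3] hands
+   * them the two live taps packed in one halfword. */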
+ if (((const int32_t *)filter_x)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hz_2t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ default:
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 16:
+ common_hz_8t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 32:
+ common_hz_8t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 64:
+ common_hz_8t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ default:
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c
new file mode 100644
index 00000000000..d0c374648c9
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c
@@ -0,0 +1,880 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+
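+/* Shuffle indices for __msa_vshf_b: each pair of entries selects two
+ * adjacent source bytes, so one shuffle feeds a two-byte dot product
+ * per output pixel.  Entries of 16 and above pull from a second source
+ * vector, which lets the 4-width masks pack two rows per register. */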
+const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
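+/* Horizontal-then-vertical 8-tap path for 4-wide blocks.  Seven setup
+ * rows are filtered horizontally, then each loop iteration filters four
+ * more rows and runs the vertical 8-tap over the sliding window of
+ * horizontal results.  height is assumed to be a multiple of 4. */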
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt_horiz;
+ v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4;
+ v8i16 horiz_out5, horiz_out6, horiz_out7, horiz_out8, horiz_out9;
+ v8i16 tmp0, tmp1, out0, out1, out2, out3, out4;
+ v8i16 filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
+
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt_horiz = LOAD_SH(filter_horiz);
+ filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0);
+ filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
+ filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
+ filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
+ src0, src1, src2, src3, src4, src5, src6, 128);
+
+ horiz_out0 = HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out2 = HORIZ_8TAP_FILT_2VECS(src2, src3, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out4 = HORIZ_8TAP_FILT_2VECS(src4, src5, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out5 = HORIZ_8TAP_FILT_2VECS(src5, src6, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out1 = (v8i16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
+ horiz_out3 = (v8i16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
+
+ filt = LOAD_SH(filter_vert);
+ filt_vert0 = __msa_splati_h(filt, 0);
+ filt_vert1 = __msa_splati_h(filt, 1);
+ filt_vert2 = __msa_splati_h(filt, 2);
+ filt_vert3 = __msa_splati_h(filt, 3);
+
+ out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+ out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+
+ horiz_out7 = HORIZ_8TAP_FILT_2VECS(src7, src8, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out6 = (v8i16)__msa_sldi_b((v16i8)horiz_out7, (v16i8)horiz_out5, 8);
+
+ out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
+
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+
+ horiz_out9 = HORIZ_8TAP_FILT_2VECS(src9, src10, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out8 = (v8i16)__msa_sldi_b((v16i8)horiz_out9, (v16i8)horiz_out7, 8);
+
+ out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8);
+
+ tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+ tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_2B_XORI128_STORE_4_BYTES_4(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ horiz_out5 = horiz_out9;
+
+ out0 = out2;
+ out1 = out3;
+ out2 = out4;
+ }
+}
+
+static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3;
+ v8i16 filt_horiz, filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
+ v8i16 horiz_out4, horiz_out5, horiz_out6, horiz_out7;
+ v8i16 horiz_out8, horiz_out9, horiz_out10;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+ v8i16 tmp0, tmp1, tmp2, tmp3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt_horiz = LOAD_SH(filter_horiz);
+ filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0);
+ filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
+ filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
+ filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
+ src0, src1, src2, src3, src4, src5, src6, 128);
+
+ horiz_out0 = HORIZ_8TAP_FILT(src0, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out1 = HORIZ_8TAP_FILT(src1, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out2 = HORIZ_8TAP_FILT(src2, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out3 = HORIZ_8TAP_FILT(src3, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out4 = HORIZ_8TAP_FILT(src4, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out5 = HORIZ_8TAP_FILT(src5, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out6 = HORIZ_8TAP_FILT(src6, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+
+ filt = LOAD_SH(filter_vert);
+ filt_vert0 = __msa_splati_h(filt, 0);
+ filt_vert1 = __msa_splati_h(filt, 1);
+ filt_vert2 = __msa_splati_h(filt, 2);
+ filt_vert3 = __msa_splati_h(filt, 3);
+
+ out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+ out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
+ out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out1);
+ out5 = (v8i16)__msa_ilvev_b((v16i8)horiz_out4, (v16i8)horiz_out3);
+ out6 = (v8i16)__msa_ilvev_b((v16i8)horiz_out6, (v16i8)horiz_out5);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+
+ horiz_out7 = HORIZ_8TAP_FILT(src7, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+
+ out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+ tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7);
+
+ horiz_out8 = HORIZ_8TAP_FILT(src8, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+
+ out7 = (v8i16)__msa_ilvev_b((v16i8)horiz_out8, (v16i8)horiz_out7);
+ tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+ tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7);
+
+ horiz_out9 = HORIZ_8TAP_FILT(src9, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+
+ out8 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8);
+ tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+ tmp2 = SRARI_SATURATE_SIGNED_H(tmp2, FILTER_BITS, 7);
+
+ horiz_out10 = HORIZ_8TAP_FILT(src10, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+
+ out9 = (v8i16)__msa_ilvev_b((v16i8)horiz_out10, (v16i8)horiz_out9);
+ tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+ tmp3 = SRARI_SATURATE_SIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_4_XORI128_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ horiz_out6 = horiz_out10;
+
+ out0 = out2;
+ out1 = out3;
+ out2 = out8;
+ out4 = out6;
+ out5 = out7;
+ out6 = out9;
+ }
+}
+
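+/* The 16-, 32- and 64-wide H+V 8-tap variants below tile the 8-wide
+ * kernel across the block, eight columns at a time. */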
+static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 8; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
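+/* Combined 2-tap (bilinear) horizontal + vertical path for 4x4 blocks:
+ * each stage is one dot product per row pair followed by a rounding
+ * shift and a clamp to 8 bits. */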
+static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ uint32_t out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 res0, res1, horiz_vec;
+ v16u8 filt_vert, filt_horiz, vec0, vec1;
+ v8u16 filt, tmp0, tmp1;
+ v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter_horiz);
+ filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LOAD_UH(filter_vert);
+ filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2);
+ horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
+ horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7);
+
+ horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
+ horiz_out3 = (v8u16)__msa_pckod_d((v2i64)horiz_out4, (v2i64)horiz_out2);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt_vert);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vert);
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ res0 = (v16u8)__msa_pckev_b((v16i8)tmp0, (v16i8)tmp0);
+ res1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp1);
+
+ out0 = __msa_copy_u_w((v4i32)res0, 0);
+ out1 = __msa_copy_u_w((v4i32)res0, 1);
+ out2 = __msa_copy_u_w((v4i32)res1, 0);
+ out3 = __msa_copy_u_w((v4i32)res1, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ uint32_t out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ v16u8 filt_horiz, filt_vert, horiz_vec;
+ v16u8 vec0, vec1, vec2, vec3;
+ v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
+ v8u16 vec4, vec5, vec6, vec7, filt;
+ v8u16 horiz_out4, horiz_out5, horiz_out6, horiz_out7, horiz_out8;
+ v16i8 res0, res1, res2, res3;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter_horiz);
+ filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LOAD_UH(filter_vert);
+ filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_8VECS_SB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LOAD_SB(src);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2);
+ horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src4);
+ horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src6);
+ horiz_out6 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out6 = SRARI_SATURATE_UNSIGNED_H(horiz_out6, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src8, src8);
+ horiz_out8 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out8 = SRARI_SATURATE_UNSIGNED_H(horiz_out8, FILTER_BITS, 7);
+
+ horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
+ horiz_out3 = (v8u16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
+ horiz_out5 = (v8u16)__msa_sldi_b((v16i8)horiz_out6, (v16i8)horiz_out4, 8);
+ horiz_out7 = (v8u16)__msa_pckod_d((v2i64)horiz_out8, (v2i64)horiz_out6);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
+
+ vec4 = __msa_dotp_u_h(vec0, filt_vert);
+ vec5 = __msa_dotp_u_h(vec1, filt_vert);
+ vec6 = __msa_dotp_u_h(vec2, filt_vert);
+ vec7 = __msa_dotp_u_h(vec3, filt_vert);
+
+ vec4 = SRARI_SATURATE_UNSIGNED_H(vec4, FILTER_BITS, 7);
+ vec5 = SRARI_SATURATE_UNSIGNED_H(vec5, FILTER_BITS, 7);
+ vec6 = SRARI_SATURATE_UNSIGNED_H(vec6, FILTER_BITS, 7);
+ vec7 = SRARI_SATURATE_UNSIGNED_H(vec7, FILTER_BITS, 7);
+
+ res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
+ res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
+ res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
+ res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
+
+ out0 = __msa_copy_u_w((v4i32)res0, 0);
+ out1 = __msa_copy_u_w((v4i32)res0, 1);
+ out2 = __msa_copy_u_w((v4i32)res1, 0);
+ out3 = __msa_copy_u_w((v4i32)res1, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+ dst += dst_stride;
+
+ out0 = __msa_copy_u_w((v4i32)res2, 0);
+ out1 = __msa_copy_u_w((v4i32)res2, 1);
+ out2 = __msa_copy_u_w((v4i32)res3, 0);
+ out3 = __msa_copy_u_w((v4i32)res3, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+}
+
+static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ } else if (8 == height) {
+ common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_horiz, filt_vert, horiz_vec;
+ v16u8 vec0, vec1, vec2, vec3;
+ v8u16 horiz_out0, horiz_out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter_horiz);
+ filt_horiz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LOAD_SH(filter_vert);
+ filt_vert = (v16u8)__msa_splati_h(filt, 0);
+
+ LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp2 = __msa_dotp_u_h(vec2, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp3 = __msa_dotp_u_h(vec3, filt_vert);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_horiz, filt_vert, vec0, horiz_vec;
+ v8u16 horiz_out0, horiz_out1;
+ v8u16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ v8i16 filt;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter_horiz);
+ filt_horiz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LOAD_SH(filter_vert);
+ filt_vert = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LOAD_SB(src);
+ src += src_stride;
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vert);
+
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp4 = __msa_dotp_u_h(vec0, filt_vert);
+
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+ tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp1, tmp2, tmp3, tmp4, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp5 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp6 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp7 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp8 = __msa_dotp_u_h(vec0, filt_vert);
+
+ tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
+ tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
+ tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
+ tmp8 = SRARI_SATURATE_UNSIGNED_H(tmp8, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp5, tmp6, tmp7, tmp8, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ } else {
+ common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ }
+}
+
+static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt_horiz, filt_vert, vec0, horiz_vec;
+ v8u16 horiz_vec0, horiz_vec1, tmp1, tmp2;
+ v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
+ v8i16 filt;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter_horiz);
+ filt_horiz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LOAD_SH(filter_vert);
+ filt_vert = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LOAD_SB(src);
+ src1 = LOAD_SB(src + 8);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
+ horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
+ horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
+
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src0, src2, src4, src6);
+ LOAD_4VECS_SB(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
+ horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
+ horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vert);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+ dst += dst_stride;
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
+ horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
+ horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vert);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+ dst += dst_stride;
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
+ horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src5);
+ horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vert);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+ dst += dst_stride;
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src6, src6);
+ horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src7);
+ horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vert);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
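+/* Full 2-D 8-tap entry point.  Requires unit steps (no scaling), turns
+ * the identity filter into a copy, routes double-bilinear cases to the
+ * 2-tap kernels, falls back to C when only one filter is bilinear, and
+ * otherwise runs the horizontal-then-vertical 8-tap kernels. */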
+void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t x_step_q4,
+ const int16_t *filter_y, int32_t y_step_q4,
+ int32_t w, int32_t h) {
+ int8_t cnt, filt_hor[8], filt_ver[8];
+
+ if (16 != x_step_q4 || 16 != y_step_q4) {
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ if (((const int32_t *)filter_x)[1] == 0x800000 &&
+ ((const int32_t *)filter_y)[1] == 0x800000) {
+ vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (((const int32_t *)filter_x)[0] == 0 &&
+ ((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 8:
+ common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 16:
+ common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 32:
+ common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 64:
+ common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ default:
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else if (((const int32_t *)filter_x)[0] == 0 ||
+ ((const int32_t *)filter_y)[0] == 0) {
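+    /* Exactly one of the two filters is bilinear here; there is no MSA
+     * kernel for a mixed 2-tap/8-tap pass, so use the C reference. */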
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ } else {
+ switch (w) {
+ case 4:
+ common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 8:
+ common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 16:
+ common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 32:
+ common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 64:
+ common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ default:
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c
new file mode 100644
index 00000000000..6b71ec1c0e4
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c
@@ -0,0 +1,856 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+
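+/* Vertical 8-tap filter for 4-wide blocks.  Adjacent rows are byte
+ * interleaved so every __msa_dotp applies one pair of taps; four such
+ * products accumulate the full 8-tap sum, and the interleaved history
+ * slides down four rows per loop iteration. */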
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+ v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+ v16i8 src2110, src4332, src6554, src8776, src10998;
+ v8i16 filt, out10, out32;
+ v16i8 filt0, filt1, filt2, filt3;
+
+ src -= (3 * src_stride);
+
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
+ src1, src3, src5, src2, src4, src6,
+ src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
+
+ ILVR_D_3VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
+ src6554, src65_r, src54_r);
+
+ XORI_B_3VECS_SB(src2110, src4332, src6554, src2110, src4332, src6554, 128);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
+ src76_r, src87_r, src98_r, src109_r);
+
+ ILVR_D_2VECS_SB(src8776, src87_r, src76_r, src10998, src109_r, src98_r);
+
+ XORI_B_2VECS_SB(src8776, src10998, src8776, src10998, 128);
+
+ out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776,
+ filt0, filt1, filt2, filt3);
+ out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998,
+ filt0, filt1, filt2, filt3);
+
+ out10 = SRARI_SATURATE_SIGNED_H(out10, FILTER_BITS, 7);
+ out32 = SRARI_SATURATE_SIGNED_H(out32, FILTER_BITS, 7);
+
+ PCKEV_2B_XORI128_STORE_4_BYTES_4(out10, out32, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+ v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+ v16i8 filt0, filt1, filt2, filt3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+ src -= (3 * src_stride);
+
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
+ src0, src1, src2, src3, src4, src5, src6, 128);
+
+ ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
+ src1, src3, src5, src2, src4, src6,
+ src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+
+ ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
+ src76_r, src87_r, src98_r, src109_r);
+
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+ filt0, filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+ filt0, filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+ filt0, filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+ filt0, filt1, filt2, filt3);
+
+ out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7);
+ out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7);
+ out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7);
+ out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7);
+
+ PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0_r, out1_r, out2_r, out3_r,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+
+ src6 = src10;
+ }
+}
+
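+/* Vertical 8-tap core for 16-wide columns, tiled width/16 times.  Both
+ * the low (ilvr) and high (ilvl) byte interleaves of each row are
+ * filtered so every iteration emits four full 16-pixel rows. */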
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height,
+ int32_t width) {
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+ v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+ v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+ v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+ v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+ v8i16 filt;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+
+ src -= (3 * src_stride);
+
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ LOAD_7VECS_SB(src_tmp, src_stride,
+ src0, src1, src2, src3, src4, src5, src6);
+ src_tmp += (7 * src_stride);
+
+ XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
+ src0, src1, src2, src3, src4, src5, src6, 128);
+
+ ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
+ src1, src3, src5, src2, src4, src6,
+ src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
+
+ ILVL_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
+ src1, src3, src5, src2, src4, src6,
+ src10_l, src32_l, src54_l, src21_l, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src_tmp, src_stride, src7, src8, src9, src10);
+ src_tmp += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+
+ ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
+ src76_r, src87_r, src98_r, src109_r);
+
+ ILVL_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
+ src76_l, src87_l, src98_l, src109_l);
+
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+ filt0, filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+ filt0, filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+ filt0, filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+ filt0, filt1, filt2, filt3);
+
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
+ filt0, filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
+ filt0, filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
+ filt0, filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
+ filt0, filt1, filt2, filt3);
+
+ out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7);
+ out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7);
+ out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7);
+ out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7);
+ out0_l = SRARI_SATURATE_SIGNED_H(out0_l, FILTER_BITS, 7);
+ out1_l = SRARI_SATURATE_SIGNED_H(out1_l, FILTER_BITS, 7);
+ out2_l = SRARI_SATURATE_SIGNED_H(out2_l, FILTER_BITS, 7);
+ out3_l = SRARI_SATURATE_SIGNED_H(out3_l, FILTER_BITS, 7);
+
+ out0_r = (v8i16)__msa_pckev_b((v16i8)out0_l, (v16i8)out0_r);
+ out1_r = (v8i16)__msa_pckev_b((v16i8)out1_l, (v16i8)out1_r);
+ out2_r = (v8i16)__msa_pckev_b((v16i8)out2_l, (v16i8)out2_r);
+ out3_r = (v8i16)__msa_pckev_b((v16i8)out3_l, (v16i8)out3_r);
+
+ XORI_B_4VECS_UB(out0_r, out1_r, out2_r, out3_r,
+ tmp0, tmp1, tmp2, tmp3, 128);
+
+ STORE_4VECS_UB(dst_tmp, dst_stride, tmp0, tmp1, tmp2, tmp3);
+ dst_tmp += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 16);
+}
+
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 32);
+}
+
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 64);
+}
+
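+/* Vertical 2-tap (bilinear) path for 4x4 blocks: two picture rows are
+ * packed per vector, filtered with a single dot product each, then
+ * rounded, clamped to 8 bits and stored one word at a time. */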
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3, src4;
+ v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+ v16i8 filt0;
+ v8u16 filt;
+
+ filt = LOAD_UH(filter);
+ filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ ILVR_B_4VECS_SB(src0, src1, src2, src3, src1, src2, src3, src4,
+ src10_r, src21_r, src32_r, src43_r);
+
+ ILVR_D_2VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r);
+
+ src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0);
+ src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0);
+
+ src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7);
+ src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7);
+
+ src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110);
+
+ out0 = __msa_copy_u_w((v4i32)src2110, 0);
+ out1 = __msa_copy_u_w((v4i32)src2110, 1);
+ out2 = __msa_copy_u_w((v4i32)src2110, 2);
+ out3 = __msa_copy_u_w((v4i32)src2110, 3);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+}
+
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+ v16i8 filt0;
+ v8u16 filt;
+
+ filt = LOAD_UH(filter);
+ filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_8VECS_SB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ src8 = LOAD_SB(src);
+ src += src_stride;
+
+ ILVR_B_8VECS_SB(src0, src1, src2, src3, src4, src5, src6, src7,
+ src1, src2, src3, src4, src5, src6, src7, src8,
+ src10_r, src21_r, src32_r, src43_r,
+ src54_r, src65_r, src76_r, src87_r);
+
+ ILVR_D_4VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
+ src6554, src65_r, src54_r, src8776, src87_r, src76_r);
+
+ src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0);
+ src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0);
+ src6554 = (v16i8)__msa_dotp_u_h((v16u8)src6554, (v16u8)filt0);
+ src8776 = (v16i8)__msa_dotp_u_h((v16u8)src8776, (v16u8)filt0);
+
+ src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7);
+ src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7);
+ src6554 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src6554, FILTER_BITS, 7);
+ src8776 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src8776, FILTER_BITS, 7);
+
+ src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110);
+ src4332 = (v16i8)__msa_pckev_b((v16i8)src8776, (v16i8)src6554);
+
+ out0 = __msa_copy_u_w((v4i32)src2110, 0);
+ out1 = __msa_copy_u_w((v4i32)src2110, 1);
+ out2 = __msa_copy_u_w((v4i32)src2110, 2);
+ out3 = __msa_copy_u_w((v4i32)src2110, 3);
+ out4 = __msa_copy_u_w((v4i32)src4332, 0);
+ out5 = __msa_copy_u_w((v4i32)src4332, 1);
+ out6 = __msa_copy_u_w((v4i32)src4332, 2);
+ out7 = __msa_copy_u_w((v4i32)src4332, 3);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+ dst += dst_stride;
+ STORE_WORD(dst, out4);
+ dst += dst_stride;
+ STORE_WORD(dst, out5);
+ dst += dst_stride;
+ STORE_WORD(dst, out6);
+ dst += dst_stride;
+ STORE_WORD(dst, out7);
+}
+
+static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 vec0, vec1, vec2, vec3, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_5VECS_UB(src, src_stride, src0, src1, src2, src3, src4);
+
+ ILVR_B_2VECS_UB(src0, src1, src1, src2, vec0, vec1);
+ ILVR_B_2VECS_UB(src2, src3, src3, src4, vec2, vec3);
+
+ /* filter calc */
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ src0 = LOAD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LOAD_8VECS_UB(src, src_stride,
+ src1, src2, src3, src4, src5, src6, src7, src8);
+ src += (8 * src_stride);
+
+ ILVR_B_4VECS_UB(src0, src1, src2, src3, src1, src2, src3, src4,
+ vec0, vec1, vec2, vec3);
+
+ ILVR_B_4VECS_UB(src4, src5, src6, src7, src5, src6, src7, src8,
+ vec4, vec5, vec6, vec7);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ tmp0 = __msa_dotp_u_h(vec4, filt0);
+ tmp1 = __msa_dotp_u_h(vec5, filt0);
+ tmp2 = __msa_dotp_u_h(vec6, filt0);
+ tmp3 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ src0 = LOAD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+ dst += dst_stride;
+
+ ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6);
+
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst);
+ dst += dst_stride;
+
+ tmp0 = __msa_dotp_u_h(vec4, filt0);
+ tmp1 = __msa_dotp_u_h(vec5, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+ dst += dst_stride;
+
+ tmp2 = __msa_dotp_u_h(vec6, filt0);
+ tmp3 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
+static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ src0 = LOAD_UB(src);
+ src5 = LOAD_UB(src + 16);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4);
+
+ ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
+
+ LOAD_4VECS_UB(src + 16, src_stride, src6, src7, src8, src9);
+ src += (4 * src_stride);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride);
+
+ ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6);
+
+ tmp0 = __msa_dotp_u_h(vec4, filt0);
+ tmp1 = __msa_dotp_u_h(vec5, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 2 * dst_stride);
+
+ tmp2 = __msa_dotp_u_h(vec6, filt0);
+ tmp3 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 3 * dst_stride);
+
+ ILV_B_LRLR_UB(src5, src6, src6, src7, vec1, vec0, vec3, vec2);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16);
+
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + dst_stride);
+
+ ILV_B_LRLR_UB(src7, src8, src8, src9, vec5, vec4, vec7, vec6);
+
+ tmp0 = __msa_dotp_u_h(vec4, filt0);
+ tmp1 = __msa_dotp_u_h(vec5, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16 + 2 * dst_stride);
+
+ tmp2 = __msa_dotp_u_h(vec6, filt0);
+ tmp3 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + 3 * dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
+static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_4VECS_UB(src, 16, src0, src3, src6, src9);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LOAD_2VECS_UB(src, src_stride, src1, src2);
+ LOAD_2VECS_UB(src + 16, src_stride, src4, src5);
+ LOAD_2VECS_UB(src + 32, src_stride, src7, src8);
+ LOAD_2VECS_UB(src + 48, src_stride, src10, src11);
+ src += (2 * src_stride);
+
+ ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride);
+
+ ILV_B_LRLR_UB(src3, src4, src4, src5, vec5, vec4, vec7, vec6);
+
+ tmp4 = __msa_dotp_u_h(vec4, filt0);
+ tmp5 = __msa_dotp_u_h(vec5, filt0);
+
+ tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
+ tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 16);
+
+ tmp6 = __msa_dotp_u_h(vec6, filt0);
+ tmp7 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
+ tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 16 + dst_stride);
+
+ ILV_B_LRLR_UB(src6, src7, src7, src8, vec1, vec0, vec3, vec2);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 32);
+
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 32 + dst_stride);
+
+ ILV_B_LRLR_UB(src9, src10, src10, src11, vec5, vec4, vec7, vec6);
+
+ tmp4 = __msa_dotp_u_h(vec4, filt0);
+ tmp5 = __msa_dotp_u_h(vec5, filt0);
+
+ tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
+ tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 48);
+
+ tmp6 = __msa_dotp_u_h(vec6, filt0);
+ tmp7 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
+ tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ int8_t cnt, filt_ver[8];
+
+ if (16 != y_step_q4) {
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
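+ /* taps 2..3 of filter_y, read as one 32-bit word (little-endian), equal
+ * { 0, 128 }: the unit filter, so the convolution reduces to a plain copy */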
+ if (((const int32_t *)filter_y)[1] == 0x800000) {
+ vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 8; cnt--;) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
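+ /* taps 0..1 are zero, so only the bilinear pair at taps 3..4 is active
+ * and the faster 2-tap paths apply */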
+ if (((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_vt_2t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ default:
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 32:
+ common_vt_8t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ default:
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
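+
+/* Reference-only scalar sketch of the common_vt_2t_* paths (assumes
+ * FILTER_BITS == 7 as in vp9_filter.h; ref_clip_pixel and common_vt_2t_ref
+ * are illustrative helpers, not libvpx APIs): each output pixel is the
+ * rounded, clipped two-tap dot product of two vertically adjacent source
+ * pixels. */
+static uint8_t ref_clip_pixel(int32_t val) {
+  return (val < 0) ? 0 : ((val > 255) ? 255 : (uint8_t)val);
+}
+
+static void common_vt_2t_ref(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             const int8_t *filter, /* taps 3 and 4 */
+                             int32_t w, int32_t h) {
+  int32_t x, y;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      const int32_t sum = src[x] * filter[0] + src[x + src_stride] * filter[1];
+
+      dst[x] = ref_clip_pixel((sum + 64) >> 7);  /* round at FILTER_BITS */
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}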
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c
new file mode 100644
index 00000000000..72b8ab71397
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint32_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ if (0 == (height % 4)) {
+ for (cnt = (height / 4); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ LOAD_4VECS_UB(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+ dst2 = __msa_aver_u_b(src2, dst2);
+ dst3 = __msa_aver_u_b(src3, dst3);
+
+ out0 = __msa_copy_u_w((v4i32)dst0, 0);
+ out1 = __msa_copy_u_w((v4i32)dst1, 0);
+ out2 = __msa_copy_u_w((v4i32)dst2, 0);
+ out3 = __msa_copy_u_w((v4i32)dst3, 0);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+ dst += dst_stride;
+ }
+ } else if (0 == (height % 2)) {
+ for (cnt = (height / 2); cnt--;) {
+ LOAD_2VECS_UB(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+
+ LOAD_2VECS_UB(dst, dst_stride, dst0, dst1);
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+
+ out0 = __msa_copy_u_w((v4i32)dst0, 0);
+ out1 = __msa_copy_u_w((v4i32)dst1, 0);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ for (cnt = (height / 4); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ LOAD_4VECS_UB(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+ dst2 = __msa_aver_u_b(src2, dst2);
+ dst3 = __msa_aver_u_b(src3, dst3);
+
+ out0 = __msa_copy_u_d((v2i64)dst0, 0);
+ out1 = __msa_copy_u_d((v2i64)dst1, 0);
+ out2 = __msa_copy_u_d((v2i64)dst2, 0);
+ out3 = __msa_copy_u_d((v2i64)dst3, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ STORE_DWORD(dst, out2);
+ dst += dst_stride;
+ STORE_DWORD(dst, out3);
+ dst += dst_stride;
+ }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ for (cnt = (height / 8); cnt--;) {
+ LOAD_8VECS_UB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ LOAD_8VECS_UB(dst, dst_stride,
+ dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+ dst2 = __msa_aver_u_b(src2, dst2);
+ dst3 = __msa_aver_u_b(src3, dst3);
+ dst4 = __msa_aver_u_b(src4, dst4);
+ dst5 = __msa_aver_u_b(src5, dst5);
+ dst6 = __msa_aver_u_b(src6, dst6);
+ dst7 = __msa_aver_u_b(src7, dst7);
+
+ STORE_8VECS_UB(dst, dst_stride,
+ dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ dst += (8 * dst_stride);
+ }
+}
+
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *dst_dup = dst;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (cnt = (height / 8); cnt--;) {
+ src0 = LOAD_UB(src);
+ src1 = LOAD_UB(src + 16);
+ src += src_stride;
+ src2 = LOAD_UB(src);
+ src3 = LOAD_UB(src + 16);
+ src += src_stride;
+ src4 = LOAD_UB(src);
+ src5 = LOAD_UB(src + 16);
+ src += src_stride;
+ src6 = LOAD_UB(src);
+ src7 = LOAD_UB(src + 16);
+ src += src_stride;
+
+ dst0 = LOAD_UB(dst_dup);
+ dst1 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst2 = LOAD_UB(dst_dup);
+ dst3 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst4 = LOAD_UB(dst_dup);
+ dst5 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst6 = LOAD_UB(dst_dup);
+ dst7 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+
+ src8 = LOAD_UB(src);
+ src9 = LOAD_UB(src + 16);
+ src += src_stride;
+ src10 = LOAD_UB(src);
+ src11 = LOAD_UB(src + 16);
+ src += src_stride;
+ src12 = LOAD_UB(src);
+ src13 = LOAD_UB(src + 16);
+ src += src_stride;
+ src14 = LOAD_UB(src);
+ src15 = LOAD_UB(src + 16);
+ src += src_stride;
+
+ dst8 = LOAD_UB(dst_dup);
+ dst9 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst10 = LOAD_UB(dst_dup);
+ dst11 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst12 = LOAD_UB(dst_dup);
+ dst13 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst14 = LOAD_UB(dst_dup);
+ dst15 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+ dst2 = __msa_aver_u_b(src2, dst2);
+ dst3 = __msa_aver_u_b(src3, dst3);
+ dst4 = __msa_aver_u_b(src4, dst4);
+ dst5 = __msa_aver_u_b(src5, dst5);
+ dst6 = __msa_aver_u_b(src6, dst6);
+ dst7 = __msa_aver_u_b(src7, dst7);
+ dst8 = __msa_aver_u_b(src8, dst8);
+ dst9 = __msa_aver_u_b(src9, dst9);
+ dst10 = __msa_aver_u_b(src10, dst10);
+ dst11 = __msa_aver_u_b(src11, dst11);
+ dst12 = __msa_aver_u_b(src12, dst12);
+ dst13 = __msa_aver_u_b(src13, dst13);
+ dst14 = __msa_aver_u_b(src14, dst14);
+ dst15 = __msa_aver_u_b(src15, dst15);
+
+ STORE_UB(dst0, dst);
+ STORE_UB(dst1, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst2, dst);
+ STORE_UB(dst3, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst4, dst);
+ STORE_UB(dst5, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst6, dst);
+ STORE_UB(dst7, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst8, dst);
+ STORE_UB(dst9, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst10, dst);
+ STORE_UB(dst11, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst12, dst);
+ STORE_UB(dst13, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst14, dst);
+ STORE_UB(dst15, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *dst_dup = dst;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (cnt = (height / 4); cnt--;) {
+ LOAD_4VECS_UB(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LOAD_4VECS_UB(src, 16, src4, src5, src6, src7);
+ src += src_stride;
+ LOAD_4VECS_UB(src, 16, src8, src9, src10, src11);
+ src += src_stride;
+ LOAD_4VECS_UB(src, 16, src12, src13, src14, src15);
+ src += src_stride;
+
+ LOAD_4VECS_UB(dst_dup, 16, dst0, dst1, dst2, dst3);
+ dst_dup += dst_stride;
+ LOAD_4VECS_UB(dst_dup, 16, dst4, dst5, dst6, dst7);
+ dst_dup += dst_stride;
+ LOAD_4VECS_UB(dst_dup, 16, dst8, dst9, dst10, dst11);
+ dst_dup += dst_stride;
+ LOAD_4VECS_UB(dst_dup, 16, dst12, dst13, dst14, dst15);
+ dst_dup += dst_stride;
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+ dst2 = __msa_aver_u_b(src2, dst2);
+ dst3 = __msa_aver_u_b(src3, dst3);
+ dst4 = __msa_aver_u_b(src4, dst4);
+ dst5 = __msa_aver_u_b(src5, dst5);
+ dst6 = __msa_aver_u_b(src6, dst6);
+ dst7 = __msa_aver_u_b(src7, dst7);
+ dst8 = __msa_aver_u_b(src8, dst8);
+ dst9 = __msa_aver_u_b(src9, dst9);
+ dst10 = __msa_aver_u_b(src10, dst10);
+ dst11 = __msa_aver_u_b(src11, dst11);
+ dst12 = __msa_aver_u_b(src12, dst12);
+ dst13 = __msa_aver_u_b(src13, dst13);
+ dst14 = __msa_aver_u_b(src14, dst14);
+ dst15 = __msa_aver_u_b(src15, dst15);
+
+ STORE_4VECS_UB(dst, 16, dst0, dst1, dst2, dst3);
+ dst += dst_stride;
+ STORE_4VECS_UB(dst, 16, dst4, dst5, dst6, dst7);
+ dst += dst_stride;
+ STORE_4VECS_UB(dst, 16, dst8, dst9, dst10, dst11);
+ dst += dst_stride;
+ STORE_4VECS_UB(dst, 16, dst12, dst13, dst14, dst15);
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 4: {
+ avg_width4_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 8: {
+ avg_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ avg_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ avg_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ avg_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ int32_t lp, cnt;
+ for (cnt = h; cnt--;) {
+ for (lp = 0; lp < w; ++lp) {
+ dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
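+
+/* Reference-only scalar sketch (avg_round_ref is an illustrative helper,
+ * not a libvpx API): __msa_aver_u_b applies this rounding average to each
+ * of its sixteen byte lanes, which is why the vector paths and the scalar
+ * fallback above agree bit-for-bit. */
+static uint8_t avg_round_ref(uint8_t a, uint8_t b) {
+  return (uint8_t)((a + b + 1) >> 1);
+}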
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c
new file mode 100644
index 00000000000..064ba762fa0
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LOAD_8VECS_UB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ STORE_DWORD(dst, out2);
+ dst += dst_stride;
+ STORE_DWORD(dst, out3);
+ dst += dst_stride;
+ STORE_DWORD(dst, out4);
+ dst += dst_stride;
+ STORE_DWORD(dst, out5);
+ dst += dst_stride;
+ STORE_DWORD(dst, out6);
+ dst += dst_stride;
+ STORE_DWORD(dst, out7);
+ dst += dst_stride;
+
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ STORE_DWORD(dst, out2);
+ dst += dst_stride;
+ STORE_DWORD(dst, out3);
+ dst += dst_stride;
+ }
+ } else if (0 == height % 8) {
+ for (cnt = height >> 3; cnt--;) {
+ LOAD_8VECS_UB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ STORE_DWORD(dst, out2);
+ dst += dst_stride;
+ STORE_DWORD(dst, out3);
+ dst += dst_stride;
+ STORE_DWORD(dst, out4);
+ dst += dst_stride;
+ STORE_DWORD(dst, out5);
+ dst += dst_stride;
+ STORE_DWORD(dst, out6);
+ dst += dst_stride;
+ STORE_DWORD(dst, out7);
+ dst += dst_stride;
+ }
+ } else if (0 == height % 4) {
+ for (cnt = (height / 4); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ STORE_DWORD(dst, out2);
+ dst += dst_stride;
+ STORE_DWORD(dst, out3);
+ dst += dst_stride;
+ }
+ } else if (0 == height % 2) {
+ for (cnt = (height / 2); cnt--;) {
+ LOAD_2VECS_UB(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height, int32_t width) {
+ int32_t cnt, loop_cnt;
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LOAD_8VECS_UB(src_tmp, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src_tmp += (8 * src_stride);
+
+ STORE_8VECS_UB(dst_tmp, dst_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ dst_tmp += (8 * dst_stride);
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LOAD_8VECS_UB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ STORE_8VECS_UB(dst, dst_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ dst += (8 * dst_stride);
+
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+ dst += (4 * dst_stride);
+
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+ dst += (4 * dst_stride);
+
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void vp9_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 4: {
+ uint32_t cnt, tmp;
+ /* copy one 32-bit word per 4-pixel row */
+ for (cnt = h; cnt--;) {
+ tmp = LOAD_WORD(src);
+ STORE_WORD(dst, tmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 8: {
+ copy_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ copy_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ copy_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ copy_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ uint32_t cnt;
+ for (cnt = h; cnt--;) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h
new file mode 100644
index 00000000000..b109a4014f8
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
+#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
+
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+extern const uint8_t mc_filt_mask_arr[16 * 3];
+
+#define HORIZ_8TAP_FILT(src, mask0, mask1, mask2, mask3, \
+ filt_h0, filt_h1, filt_h2, filt_h3) ({ \
+ v8i16 vec0, vec1, vec2, vec3, horiz_out; \
+ \
+ vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src), (v16i8)(src)); \
+ vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \
+ vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src), (v16i8)(src)); \
+ vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \
+ vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src), (v16i8)(src)); \
+ vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \
+ vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src), (v16i8)(src)); \
+ vec2 = __msa_dpadd_s_h(vec2, (v16i8)(filt_h3), (v16i8)vec3); \
+ vec0 = __msa_adds_s_h(vec0, vec2); \
+ horiz_out = SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \
+ \
+ horiz_out; \
+})
+
+#define HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3, \
+ filt_h0, filt_h1, filt_h2, filt_h3) ({ \
+ v8i16 vec0, vec1, vec2, vec3, horiz_out; \
+ \
+ vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \
+ vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \
+ vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \
+ vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \
+ vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \
+ vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \
+ vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \
+ vec2 = __msa_dpadd_s_h(vec2, (v16i8)(filt_h3), (v16i8)vec3); \
+ vec0 = __msa_adds_s_h(vec0, vec2); \
+ horiz_out = (v8i16)SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \
+ \
+ horiz_out; \
+})
+
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
+ filt0, filt1, filt2, filt3) ({ \
+ v8i16 tmp0, tmp1; \
+ \
+ tmp0 = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0)); \
+ tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)(vec1), (v16i8)(filt1)); \
+ tmp1 = __msa_dotp_s_h((v16i8)(vec2), (v16i8)(filt2)); \
+ tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)(vec3), (v16i8)(filt3)); \
+ tmp0 = __msa_adds_s_h(tmp0, tmp1); \
+ \
+ tmp0; \
+})
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
+ mask0, mask1, mask2, mask3, \
+ filt0, filt1, filt2, filt3, \
+ out0, out1) { \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \
+ vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src2)); \
+ \
+ res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \
+ res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \
+ \
+ vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \
+ vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src2)); \
+ \
+ res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec2_m); \
+ res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec3_m); \
+ \
+ vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \
+ vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src2)); \
+ \
+ res2_m = __msa_dotp_s_h((v16i8)vec4_m, (v16i8)(filt2)); \
+ res3_m = __msa_dotp_s_h((v16i8)vec5_m, (v16i8)(filt2)); \
+ \
+ vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \
+ vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src2)); \
+ \
+ res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt3), (v16i8)vec6_m); \
+ res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt3), (v16i8)vec7_m); \
+ \
+ out0 = __msa_adds_s_h(res0_m, res2_m); \
+ out1 = __msa_adds_s_h(res1_m, res3_m); \
+}
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
+ mask0, mask1, mask2, mask3, \
+ filt0, filt1, filt2, filt3, \
+ out0, out1, out2, out3) { \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v8i16 vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ v8i16 res4_m, res5_m, res6_m, res7_m; \
+ \
+ vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src0), (v16i8)(src0)); \
+ vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src1)); \
+ vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src2), (v16i8)(src2)); \
+ vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src3)); \
+ \
+ res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \
+ res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \
+ res2_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt0)); \
+ res3_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt0)); \
+ \
+ vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src0), (v16i8)(src0)); \
+ vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src1)); \
+ vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src2), (v16i8)(src2)); \
+ vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src3)); \
+ \
+ res4_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt2)); \
+ res5_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt2)); \
+ res6_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt2)); \
+ res7_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt2)); \
+ \
+ vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src0), (v16i8)(src0)); \
+ vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src1)); \
+ vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src2), (v16i8)(src2)); \
+ vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src3)); \
+ \
+ res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec4_m); \
+ res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec5_m); \
+ res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt1), (v16i8)vec6_m); \
+ res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt1), (v16i8)vec7_m); \
+ \
+ vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src0), (v16i8)(src0)); \
+ vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src1)); \
+ vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src2), (v16i8)(src2)); \
+ vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src3)); \
+ \
+ res4_m = __msa_dpadd_s_h(res4_m, (v16i8)(filt3), (v16i8)vec4_m); \
+ res5_m = __msa_dpadd_s_h(res5_m, (v16i8)(filt3), (v16i8)vec5_m); \
+ res6_m = __msa_dpadd_s_h(res6_m, (v16i8)(filt3), (v16i8)vec6_m); \
+ res7_m = __msa_dpadd_s_h(res7_m, (v16i8)(filt3), (v16i8)vec7_m); \
+ \
+ out0 = __msa_adds_s_h(res0_m, res4_m); \
+ out1 = __msa_adds_s_h(res1_m, res5_m); \
+ out2 = __msa_adds_s_h(res2_m, res6_m); \
+ out3 = __msa_adds_s_h(res3_m, res7_m); \
+}
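+
+/* Reference-only scalar sketch (filt_8tap_ref is an illustrative helper,
+ * not part of the MSA API): the net effect of the bias, dot-product,
+ * round and saturate pipeline built from the macros above is an 8-tap
+ * filter rounded at FILTER_BITS (assumed == 7) and clipped to [0, 255].
+ * src points at the output pixel and needs 3 pixels of left context. */
+static uint8_t filt_8tap_ref(const uint8_t *src, const int8_t *filter) {
+  int32_t k, sum = 0;
+
+  for (k = 0; k < 8; ++k) {
+    sum += src[k - 3] * filter[k];
+  }
+  sum = (sum + 64) >> 7;
+
+  return (sum < 0) ? 0 : ((sum > 255) ? 255 : (uint8_t)sum);
+}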
+#endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
new file mode 100644
index 00000000000..e5c0eaa32f3
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
@@ -0,0 +1,948 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+#define SET_COSPI_PAIR(c0_h, c1_h) ({ \
+ v8i16 out0, r0_m, r1_m; \
+ \
+ r0_m = __msa_fill_h(c0_h); \
+ r1_m = __msa_fill_h(c1_h); \
+ out0 = __msa_ilvev_h(r1_m, r0_m); \
+ \
+ out0; \
+})
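+
+/* SET_COSPI_PAIR broadcasts the halfword pattern { c0, c1, c0, c1, ... }
+ * consumed by the dot-product rotation macros below. */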
+
+#define DOTP_CONST_PAIR(reg0, reg1, const0, const1, out0, out1) { \
+ v8i16 k0_m = __msa_fill_h(const0); \
+ v8i16 s0_m, s1_m, s2_m, s3_m; \
+ \
+ s0_m = __msa_fill_h(const1); \
+ k0_m = __msa_ilvev_h(s0_m, k0_m); \
+ \
+ s0_m = __msa_ilvl_h(-reg1, reg0); \
+ s1_m = __msa_ilvr_h(-reg1, reg0); \
+ s2_m = __msa_ilvl_h(reg0, reg1); \
+ s3_m = __msa_ilvr_h(reg0, reg1); \
+ s1_m = (v8i16)__msa_dotp_s_w(s1_m, k0_m); \
+ s0_m = (v8i16)__msa_dotp_s_w(s0_m, k0_m); \
+ s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \
+ s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_h(s0_m, s1_m); \
+ \
+ s1_m = (v8i16)__msa_dotp_s_w(s3_m, k0_m); \
+ s0_m = (v8i16)__msa_dotp_s_w(s2_m, k0_m); \
+ s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \
+ s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \
+ out1 = __msa_pckev_h(s0_m, s1_m); \
+}
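+
+/* DOTP_CONST_PAIR evaluates, lane-wise and rounded at DCT_CONST_BITS:
+ *   out0 = round(reg0 * const0 - reg1 * const1)
+ *   out1 = round(reg1 * const0 + reg0 * const1)
+ * i.e. the pairwise rotation of the fixed-point inverse DCT. */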
+
+#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \
+ v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
+ v8i16 madd_s0_m, madd_s1_m; \
+ \
+ ILV_H_LR_SH(m0, m1, madd_s1_m, madd_s0_m); \
+ \
+ DOTP_S_W_4VECS_SW(madd_s0_m, c0, madd_s1_m, c0, \
+ madd_s0_m, c1, madd_s1_m, c1, \
+ madd0_m, madd1_m, madd2_m, madd3_m); \
+ \
+ SRARI_W_4VECS_SW(madd0_m, madd1_m, madd2_m, madd3_m, \
+ madd0_m, madd1_m, madd2_m, madd3_m, \
+ DCT_CONST_BITS); \
+ \
+ PCKEV_H_2VECS_SH(madd1_m, madd0_m, madd3_m, madd2_m, \
+ res0, res1); \
+}
+
+#define VP9_MADD_BF(inp0, inp1, inp2, inp3, \
+ cst0, cst1, cst2, cst3, \
+ out0, out1, out2, out3) { \
+ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v4i32 m4_m, m5_m; \
+ \
+ ILV_H_LRLR_SH(inp0, inp1, inp2, inp3, \
+ madd_s1_m, madd_s0_m, madd_s3_m, madd_s2_m); \
+ \
+ DOTP_S_W_4VECS_SW(madd_s0_m, cst0, madd_s1_m, cst0, \
+ madd_s2_m, cst2, madd_s3_m, cst2, \
+ tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ \
+ m4_m = tmp0_m + tmp2_m; \
+ m5_m = tmp1_m + tmp3_m; \
+ tmp3_m = tmp1_m - tmp3_m; \
+ tmp2_m = tmp0_m - tmp2_m; \
+ \
+ SRARI_W_4VECS_SW(m4_m, m5_m, tmp2_m, tmp3_m, \
+ m4_m, m5_m, tmp2_m, tmp3_m, \
+ DCT_CONST_BITS); \
+ \
+ PCKEV_H_2VECS_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
+ \
+ DOTP_S_W_4VECS_SW(madd_s0_m, cst1, madd_s1_m, cst1, \
+ madd_s2_m, cst3, madd_s3_m, cst3, \
+ tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ \
+ m4_m = tmp0_m + tmp2_m; \
+ m5_m = tmp1_m + tmp3_m; \
+ tmp3_m = tmp1_m - tmp3_m; \
+ tmp2_m = tmp0_m - tmp2_m; \
+ \
+ SRARI_W_4VECS_SW(m4_m, m5_m, tmp2_m, tmp3_m, \
+ m4_m, m5_m, tmp2_m, tmp3_m, \
+ DCT_CONST_BITS); \
+ \
+ PCKEV_H_2VECS_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
+}
+
+#define TRANSPOSE8x8_H1(in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ v8i16 loc0_m, loc1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ loc0_m = __msa_ilvr_h((in6), (in4)); \
+ loc1_m = __msa_ilvr_h((in7), (in5)); \
+ tmp0_m = __msa_ilvr_h(loc1_m, loc0_m); \
+ tmp1_m = __msa_ilvl_h(loc1_m, loc0_m); \
+ \
+ loc0_m = __msa_ilvl_h((in6), (in4)); \
+ loc1_m = __msa_ilvl_h((in7), (in5)); \
+ tmp2_m = __msa_ilvr_h(loc1_m, loc0_m); \
+ tmp3_m = __msa_ilvl_h(loc1_m, loc0_m); \
+ \
+ loc0_m = __msa_ilvr_h((in2), (in0)); \
+ loc1_m = __msa_ilvr_h((in3), (in1)); \
+ tmp4_m = __msa_ilvr_h(loc1_m, loc0_m); \
+ tmp5_m = __msa_ilvl_h(loc1_m, loc0_m); \
+ \
+ loc0_m = __msa_ilvl_h((in2), (in0)); \
+ loc1_m = __msa_ilvl_h((in3), (in1)); \
+ tmp6_m = __msa_ilvr_h(loc1_m, loc0_m); \
+ tmp7_m = __msa_ilvl_h(loc1_m, loc0_m); \
+ \
+ out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+ out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+}
+
+#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, \
+ r8, r9, r10, r11, r12, r13, r14, r15, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ out8, out9, out10, out11, \
+ out12, out13, out14, out15) { \
+ v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
+ v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
+ v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
+ v8i16 h8_m, h9_m, h10_m, h11_m; \
+ v8i16 k0_m, k1_m, k2_m, k3_m; \
+ \
+ /* stage 1 */ \
+ k0_m = SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
+ k1_m = SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
+ k2_m = SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
+ k3_m = SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
+ VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \
+ g0_m, g1_m, g2_m, g3_m); \
+ \
+ k0_m = SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
+ k1_m = SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
+ k2_m = SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
+ k3_m = SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
+ VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \
+ g4_m, g5_m, g6_m, g7_m); \
+ \
+ k0_m = SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
+ k1_m = SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
+ k2_m = SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
+ k3_m = SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
+ VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \
+ g8_m, g9_m, g10_m, g11_m); \
+ \
+ k0_m = SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
+ k1_m = SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
+ k2_m = SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
+ k3_m = SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
+ VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \
+ g12_m, g13_m, g14_m, g15_m); \
+ \
+ /* stage 2 */ \
+ k0_m = SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
+ k1_m = SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
+ k2_m = SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
+ VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \
+ h0_m, h1_m, h2_m, h3_m); \
+ \
+ k0_m = SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
+ k1_m = SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
+ k2_m = SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
+ VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \
+ h4_m, h5_m, h6_m, h7_m); \
+ \
+ BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
+ \
+ BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \
+ h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
+ \
+ /* stage 3 */ \
+ BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
+ \
+ k0_m = SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ k1_m = SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ k2_m = SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
+ VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \
+ out4, out6, out5, out7); \
+ VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \
+ out12, out14, out13, out15); \
+ \
+ /* stage 4 */ \
+ k0_m = SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
+ k1_m = SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
+ k2_m = SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
+ k3_m = SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
+ VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
+ VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
+ VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
+ VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
+}
+
+#define VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, \
+ in0, in1, in2, in3) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ v16u8 dest0_m, dest1_m, dest2_m, dest3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ v16i8 zero_m = { 0 }; \
+ uint8_t *dst_m = (uint8_t *)(dest); \
+ \
+ LOAD_4VECS_UB(dst_m, (dest_stride), \
+ dest0_m, dest1_m, dest2_m, dest3_m); \
+ \
+ res0_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest0_m); \
+ res1_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest1_m); \
+ res2_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest2_m); \
+ res3_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest3_m); \
+ \
+ res0_m += (v8i16)(in0); \
+ res1_m += (v8i16)(in1); \
+ res2_m += (v8i16)(in2); \
+ res3_m += (v8i16)(in3); \
+ \
+ res0_m = CLIP_UNSIGNED_CHAR_H(res0_m); \
+ res1_m = CLIP_UNSIGNED_CHAR_H(res1_m); \
+ res2_m = CLIP_UNSIGNED_CHAR_H(res2_m); \
+ res3_m = CLIP_UNSIGNED_CHAR_H(res3_m); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)res1_m, (v16i8)res0_m); \
+ tmp1_m = __msa_pckev_b((v16i8)res3_m, (v16i8)res2_m); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += (dest_stride); \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += (dest_stride); \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += (dest_stride); \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
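+/* Each 1-D pass below follows the standard fixed-point idct16 flow:
+ * rotate coefficient pairs with DOTP_CONST_PAIR, combine the results
+ * with butterflies, and round at DCT_CONST_BITS between stages. */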
+void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
+ v8i16 loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+ v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+ v8i16 tmp5, tmp6, tmp7;
+
+ /* load left top 8x8 */
+ LOAD_8VECS_SH(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ /* load right top 8x8 */
+ LOAD_8VECS_SH((input + 8), 16,
+ reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+ /* transpose block */
+ TRANSPOSE8x8_H1(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+ reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ /* transpose block */
+ TRANSPOSE8x8_H1(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15,
+ reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+ DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+ DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+
+ loc0 = reg2 + reg10;
+ reg2 = reg2 - reg10;
+ loc1 = reg14 + reg6;
+ reg14 = reg14 - reg6;
+
+ DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+ DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+
+ reg14 = reg8 - reg12;
+ reg2 = reg8 + reg12;
+ reg10 = reg0 - reg4;
+ reg6 = reg0 + reg4;
+
+ reg0 = reg2 - loc1;
+ reg2 = reg2 + loc1;
+ reg12 = reg14 - loc0;
+ reg14 = reg14 + loc0;
+ reg4 = reg6 - loc3;
+ reg6 = reg6 + loc3;
+ reg8 = reg10 - loc2;
+ reg10 = reg10 + loc2;
+
+ /* stage 2 */
+ DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+ DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+ reg9 = reg1 - loc2;
+ reg1 = reg1 + loc2;
+ reg7 = reg15 - loc3;
+ reg15 = reg15 + loc3;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+
+ reg13 = loc0 + reg5;
+ reg5 = loc0 - reg5;
+ reg3 = loc1 + reg11;
+ reg11 = loc1 - reg11;
+
+ loc1 = reg15 + reg3;
+ reg3 = reg15 - reg3;
+ loc2 = reg2 + loc1;
+ reg15 = reg2 - loc1;
+
+ loc1 = reg1 + reg13;
+ reg13 = reg1 - reg13;
+ loc0 = reg0 + loc1;
+ loc1 = reg0 - loc1;
+ tmp6 = loc0;
+ tmp7 = loc1;
+ reg0 = loc2;
+
+ DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+ DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+ loc0 = reg9 + reg5;
+ reg5 = reg9 - reg5;
+ reg2 = reg6 + loc0;
+ reg1 = reg6 - loc0;
+
+ loc0 = reg7 + reg11;
+ reg11 = reg7 - reg11;
+ loc1 = reg4 + loc0;
+ loc2 = reg4 - loc0;
+ tmp5 = loc1;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+
+ loc0 = reg8 + reg5;
+ loc1 = reg8 - reg5;
+ reg4 = reg10 + reg11;
+ reg9 = reg10 - reg11;
+ reg10 = loc0;
+ reg11 = loc1;
+
+ DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+
+ reg8 = reg12 + reg3;
+ reg5 = reg12 - reg3;
+ reg6 = reg14 + reg13;
+ reg7 = reg14 - reg13;
+ reg13 = loc2;
+
+ /* Transpose and store the output */
+ reg12 = tmp5;
+ reg14 = tmp6;
+ reg3 = tmp7;
+
+ /* transpose block */
+ TRANSPOSE8x8_H1(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
+ reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+
+ STORE_8VECS_SH(output, 16, reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+
+ /* transpose block */
+ TRANSPOSE8x8_H1(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
+ reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+
+ STORE_8VECS_SH((output + 8), 16,
+ reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+}
+
+void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ v8i16 loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+ v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+ v8i16 tmp5, tmp6, tmp7;
+
+ /* load top 8x8 */
+ LOAD_8VECS_SH(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ /* load bottom 8x8 */
+ LOAD_8VECS_SH((input + 8 * 16), 16,
+ reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+ DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+ DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+
+ loc0 = reg2 + reg10;
+ reg2 = reg2 - reg10;
+ loc1 = reg14 + reg6;
+ reg14 = reg14 - reg6;
+
+ DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+ DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+
+ reg14 = reg8 - reg12;
+ reg2 = reg8 + reg12;
+ reg10 = reg0 - reg4;
+ reg6 = reg0 + reg4;
+
+ reg0 = reg2 - loc1;
+ reg2 = reg2 + loc1;
+ reg12 = reg14 - loc0;
+ reg14 = reg14 + loc0;
+ reg4 = reg6 - loc3;
+ reg6 = reg6 + loc3;
+ reg8 = reg10 - loc2;
+ reg10 = reg10 + loc2;
+
+ /* stage 2 */
+ DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+ DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+ reg9 = reg1 - loc2;
+ reg1 = reg1 + loc2;
+ reg7 = reg15 - loc3;
+ reg15 = reg15 + loc3;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+
+ reg13 = loc0 + reg5;
+ reg5 = loc0 - reg5;
+ reg3 = loc1 + reg11;
+ reg11 = loc1 - reg11;
+
+ loc1 = reg15 + reg3;
+ reg3 = reg15 - reg3;
+ loc2 = reg2 + loc1;
+ reg15 = reg2 - loc1;
+
+ loc1 = reg1 + reg13;
+ reg13 = reg1 - reg13;
+ loc0 = reg0 + loc1;
+ loc1 = reg0 - loc1;
+ tmp6 = loc0;
+ tmp7 = loc1;
+ reg0 = loc2;
+
+ DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+ DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+ loc0 = reg9 + reg5;
+ reg5 = reg9 - reg5;
+ reg2 = reg6 + loc0;
+ reg1 = reg6 - loc0;
+
+ loc0 = reg7 + reg11;
+ reg11 = reg7 - reg11;
+ loc1 = reg4 + loc0;
+ loc2 = reg4 - loc0;
+ tmp5 = loc1;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+
+ loc0 = reg8 + reg5;
+ loc1 = reg8 - reg5;
+ reg4 = reg10 + reg11;
+ reg9 = reg10 - reg11;
+ reg10 = loc0;
+ reg11 = loc1;
+
+ DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+
+ reg8 = reg12 + reg3;
+ reg5 = reg12 - reg3;
+ reg6 = reg14 + reg13;
+ reg7 = reg14 - reg13;
+ reg13 = loc2;
+
+ /* restore the saved terms, then round, add the prediction and store */
+ reg12 = tmp5;
+ reg14 = tmp6;
+ reg3 = tmp7;
+
+ SRARI_H_4VECS_SH(reg0, reg2, reg4, reg6, reg0, reg2, reg4, reg6, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride,
+ reg0, reg2, reg4, reg6);
+ SRARI_H_4VECS_SH(reg8, reg10, reg12, reg14, reg8, reg10, reg12, reg14, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (4 * dest_stride)),
+ dest_stride, reg8, reg10, reg12, reg14);
+ SRARI_H_4VECS_SH(reg3, reg13, reg11, reg5, reg3, reg13, reg11, reg5, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (8 * dest_stride)),
+ dest_stride, reg3, reg13, reg11, reg5);
+ SRARI_H_4VECS_SH(reg7, reg9, reg1, reg15, reg7, reg9, reg1, reg15, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (12 * dest_stride)),
+ dest_stride, reg7, reg9, reg1, reg15);
+}
+
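+/* The 16x16 inverse DCT is separable: two row passes transform the top
+ * and bottom 16x8 halves into the intermediate buffer, then two column
+ * passes transform the left and right 8x16 halves and add the result
+ * into the prediction in dest. */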
+void vp9_idct16x16_256_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+ int16_t *out = out_arr;
+
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vp9_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dest + (i << 3)),
+ dest_stride);
+ }
+}
+
+void vp9_idct16x16_10_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ uint8_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+ int16_t *out = out_arr;
+
+ /* process 16 * 8 block */
+ vp9_idct16_1d_rows_msa(input, out);
+
+ /* the 10-coefficient case yields valid data only in the top 4 rows, so
+ * zero the remaining 12 rows before the column transform */
+ out += 4 * 16;
+ for (i = 12; i--;) {
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[out]) \n\t"
+ "sw $zero, 4(%[out]) \n\t"
+ "sw $zero, 8(%[out]) \n\t"
+ "sw $zero, 12(%[out]) \n\t"
+ "sw $zero, 16(%[out]) \n\t"
+ "sw $zero, 20(%[out]) \n\t"
+ "sw $zero, 24(%[out]) \n\t"
+ "sw $zero, 28(%[out]) \n\t"
+
+ :
+ : [out] "r" (out)
+ );
+
+ out += 16;
+ }
+
+ out = out_arr;
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dest + (i << 3)),
+ dest_stride);
+ }
+}
+
+void vp9_idct16x16_1_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ uint8_t i;
+ int32_t const1;
+ int16_t out;
+ v8i16 const2, res0, res1, res2, res3, res4, res5, res6, res7;
+ v16u8 dest0, dest1, dest2, dest3;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16i8 zero = { 0 };
+
+ out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ const1 = ROUND_POWER_OF_TWO(out, 6);
+
+ const2 = __msa_fill_h(const1);
+
+ for (i = 0; i < 4; ++i) {
+ LOAD_4VECS_UB(dest, dest_stride, dest0, dest1, dest2, dest3);
+
+ res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0);
+ res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1);
+ res2 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest2);
+ res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3);
+ res4 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest0);
+ res5 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest1);
+ res6 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest2);
+ res7 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest3);
+
+ res0 += const2;
+ res1 += const2;
+ res2 += const2;
+ res3 += const2;
+ res4 += const2;
+ res5 += const2;
+ res6 += const2;
+ res7 += const2;
+
+ res0 = CLIP_UNSIGNED_CHAR_H(res0);
+ res1 = CLIP_UNSIGNED_CHAR_H(res1);
+ res2 = CLIP_UNSIGNED_CHAR_H(res2);
+ res3 = CLIP_UNSIGNED_CHAR_H(res3);
+ res4 = CLIP_UNSIGNED_CHAR_H(res4);
+ res5 = CLIP_UNSIGNED_CHAR_H(res5);
+ res6 = CLIP_UNSIGNED_CHAR_H(res6);
+ res7 = CLIP_UNSIGNED_CHAR_H(res7);
+
+ tmp0 = (v16u8)__msa_pckev_b((v16i8)res4, (v16i8)res0);
+ tmp1 = (v16u8)__msa_pckev_b((v16i8)res5, (v16i8)res1);
+ tmp2 = (v16u8)__msa_pckev_b((v16i8)res6, (v16i8)res2);
+ tmp3 = (v16u8)__msa_pckev_b((v16i8)res7, (v16i8)res3);
+
+ STORE_4VECS_UB(dest, dest_stride, tmp0, tmp1, tmp2, tmp3);
+ dest += (4 * dest_stride);
+ }
+}
+
+static void vp9_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data */
+ LOAD_16VECS_SH(input, 8,
+ l0, l8, l1, l9, l2, l10, l3, l11,
+ l4, l12, l5, l13, l6, l14, l7, l15);
+
+ TRANSPOSE8x8_H_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+ l0, l1, l2, l3, l4, l5, l6, l7);
+
+ TRANSPOSE8x8_H_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+ l8, l9, l10, l11, l12, l13, l14, l15);
+
+ /* horizontal ADST pass */
+ VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7,
+ l8, l9, l10, l11, l12, l13, l14, l15,
+ r0, r1, r2, r3, r4, r5, r6, r7,
+ r8, r9, r10, r11, r12, r13, r14, r15);
+
+ l1 = -r8;
+ l3 = -r4;
+ l13 = -r13;
+ l15 = -r1;
+
+ TRANSPOSE8x8_H_SH(r0, l1, r12, l3, r6, r14, r10, r2,
+ l0, l1, l2, l3, l4, l5, l6, l7);
+
+ STORE_8VECS_SH(output, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+
+ TRANSPOSE8x8_H_SH(r3, r11, r15, r7, r5, l13, r9, l15,
+ l8, l9, l10, l11, l12, l13, l14, l15);
+
+ STORE_8VECS_SH((output + 8), 16, l8, l9, l10, l11, l12, l13, l14, l15);
+}
+
+static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+ v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
+ v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+ v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
+ v16u8 dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7;
+ v16u8 dest8, dest9, dest10, dest11, dest12, dest13, dest14, dest15;
+ v16i8 zero = { 0 };
+
+ r0 = LOAD_SH(input + 0 * 16);
+ r3 = LOAD_SH(input + 3 * 16);
+ r4 = LOAD_SH(input + 4 * 16);
+ r7 = LOAD_SH(input + 7 * 16);
+ r8 = LOAD_SH(input + 8 * 16);
+ r11 = LOAD_SH(input + 11 * 16);
+ r12 = LOAD_SH(input + 12 * 16);
+ r15 = LOAD_SH(input + 15 * 16);
+
+ /* stage 1 */
+ k0 = SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
+ k1 = SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
+ k2 = SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
+ k3 = SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
+ VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ k0 = SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
+ k1 = SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
+ k2 = SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
+ k3 = SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
+ VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+ BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
+
+ k0 = SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+ k1 = SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+ k2 = SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
+ VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+ r1 = LOAD_SH(input + 1 * 16);
+ r2 = LOAD_SH(input + 2 * 16);
+ r5 = LOAD_SH(input + 5 * 16);
+ r6 = LOAD_SH(input + 6 * 16);
+ r9 = LOAD_SH(input + 9 * 16);
+ r10 = LOAD_SH(input + 10 * 16);
+ r13 = LOAD_SH(input + 13 * 16);
+ r14 = LOAD_SH(input + 14 * 16);
+
+ k0 = SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
+ k1 = SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
+ k2 = SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
+ k3 = SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
+ VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
+
+ k0 = SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
+ k1 = SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
+ k2 = SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
+ k3 = SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
+ VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
+
+ BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
+
+ BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
+ out1 = -out1;
+ out0 = __msa_srari_h(out0, 6);
+ out1 = __msa_srari_h(out1, 6);
+ dest0 = LOAD_UB(dest + 0 * dest_stride);
+ dest1 = LOAD_UB(dest + 15 * dest_stride);
+ res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0);
+ res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1);
+ res0 += out0;
+ res1 += out1;
+ res0 = CLIP_UNSIGNED_CHAR_H(res0);
+ res1 = CLIP_UNSIGNED_CHAR_H(res1);
+ res0 = (v8i16)__msa_pckev_b((v16i8)res0, (v16i8)res0);
+ res1 = (v8i16)__msa_pckev_b((v16i8)res1, (v16i8)res1);
+ STORE_DWORD(dest, __msa_copy_u_d((v2i64)res0, 0));
+ STORE_DWORD(dest + 15 * dest_stride, __msa_copy_u_d((v2i64)res1, 0));
+
+ k0 = SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+ k1 = SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+ k2 = SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
+ VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+ BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+ out8 = -out8;
+
+ out8 = __msa_srari_h(out8, 6);
+ out9 = __msa_srari_h(out9, 6);
+ dest8 = LOAD_UB(dest + 1 * dest_stride);
+ dest9 = LOAD_UB(dest + 14 * dest_stride);
+ res8 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest8);
+ res9 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest9);
+ res8 += out8;
+ res9 += out9;
+ res8 = CLIP_UNSIGNED_CHAR_H(res8);
+ res9 = CLIP_UNSIGNED_CHAR_H(res9);
+ res8 = (v8i16)__msa_pckev_b((v16i8)res8, (v16i8)res8);
+ res9 = (v8i16)__msa_pckev_b((v16i8)res9, (v16i8)res9);
+ STORE_DWORD(dest + dest_stride, __msa_copy_u_d((v2i64)res8, 0));
+ STORE_DWORD(dest + 14 * dest_stride, __msa_copy_u_d((v2i64)res9, 0));
+
+ k0 = SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+ k1 = SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+ k2 = SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
+ VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
+ out4 = -out4;
+ out4 = __msa_srari_h(out4, 6);
+ out5 = __msa_srari_h(out5, 6);
+ dest4 = LOAD_UB(dest + 3 * dest_stride);
+ dest5 = LOAD_UB(dest + 12 * dest_stride);
+ res4 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest4);
+ res5 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest5);
+ res4 += out4;
+ res5 += out5;
+ res4 = CLIP_UNSIGNED_CHAR_H(res4);
+ res5 = CLIP_UNSIGNED_CHAR_H(res5);
+ res4 = (v8i16)__msa_pckev_b((v16i8)res4, (v16i8)res4);
+ res5 = (v8i16)__msa_pckev_b((v16i8)res5, (v16i8)res5);
+ STORE_DWORD(dest + 3 * dest_stride, __msa_copy_u_d((v2i64)res4, 0));
+ STORE_DWORD(dest + 12 * dest_stride, __msa_copy_u_d((v2i64)res5, 0));
+
+ VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+ out13 = -out13;
+ out12 = __msa_srari_h(out12, 6);
+ out13 = __msa_srari_h(out13, 6);
+ dest12 = LOAD_UB(dest + 2 * dest_stride);
+ dest13 = LOAD_UB(dest + 13 * dest_stride);
+ res12 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest12);
+ res13 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest13);
+ res12 += out12;
+ res13 += out13;
+ res12 = CLIP_UNSIGNED_CHAR_H(res12);
+ res13 = CLIP_UNSIGNED_CHAR_H(res13);
+ res12 = (v8i16)__msa_pckev_b((v16i8)res12, (v16i8)res12);
+ res13 = (v8i16)__msa_pckev_b((v16i8)res13, (v16i8)res13);
+ STORE_DWORD(dest + 2 * dest_stride, __msa_copy_u_d((v2i64)res12, 0));
+ STORE_DWORD(dest + 13 * dest_stride, __msa_copy_u_d((v2i64)res13, 0));
+
+ k0 = SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+ k3 = SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+ VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ out6 = __msa_srari_h(out6, 6);
+ out7 = __msa_srari_h(out7, 6);
+ dest6 = LOAD_UB(dest + 4 * dest_stride);
+ dest7 = LOAD_UB(dest + 11 * dest_stride);
+ res6 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest6);
+ res7 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest7);
+ res6 += out6;
+ res7 += out7;
+ res6 = CLIP_UNSIGNED_CHAR_H(res6);
+ res7 = CLIP_UNSIGNED_CHAR_H(res7);
+ res6 = (v8i16)__msa_pckev_b((v16i8)res6, (v16i8)res6);
+ res7 = (v8i16)__msa_pckev_b((v16i8)res7, (v16i8)res7);
+ STORE_DWORD(dest + 4 * dest_stride, __msa_copy_u_d((v2i64)res6, 0));
+ STORE_DWORD(dest + 11 * dest_stride, __msa_copy_u_d((v2i64)res7, 0));
+
+ VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ out10 = __msa_srari_h(out10, 6);
+ out11 = __msa_srari_h(out11, 6);
+ dest10 = LOAD_UB(dest + 6 * dest_stride);
+ dest11 = LOAD_UB(dest + 9 * dest_stride);
+ res10 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest10);
+ res11 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest11);
+ res10 += out10;
+ res11 += out11;
+ res10 = CLIP_UNSIGNED_CHAR_H(res10);
+ res11 = CLIP_UNSIGNED_CHAR_H(res11);
+ res10 = (v8i16)__msa_pckev_b((v16i8)res10, (v16i8)res10);
+ res11 = (v8i16)__msa_pckev_b((v16i8)res11, (v16i8)res11);
+ STORE_DWORD(dest + 6 * dest_stride, __msa_copy_u_d((v2i64)res10, 0));
+ STORE_DWORD(dest + 9 * dest_stride, __msa_copy_u_d((v2i64)res11, 0));
+
+ k1 = SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
+ k2 = SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+ VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ out2 = __msa_srari_h(out2, 6);
+ out3 = __msa_srari_h(out3, 6);
+ dest2 = LOAD_UB(dest + 7 * dest_stride);
+ dest3 = LOAD_UB(dest + 8 * dest_stride);
+ res2 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest2);
+ res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3);
+ res2 += out2;
+ res3 += out3;
+ res2 = CLIP_UNSIGNED_CHAR_H(res2);
+ res3 = CLIP_UNSIGNED_CHAR_H(res3);
+ res2 = (v8i16)__msa_pckev_b((v16i8)res2, (v16i8)res2);
+ res3 = (v8i16)__msa_pckev_b((v16i8)res3, (v16i8)res3);
+ STORE_DWORD(dest + 7 * dest_stride, __msa_copy_u_d((v2i64)res2, 0));
+ STORE_DWORD(dest + 8 * dest_stride, __msa_copy_u_d((v2i64)res3, 0));
+
+ VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ out14 = __msa_srari_h(out14, 6);
+ out15 = __msa_srari_h(out15, 6);
+ dest14 = LOAD_UB(dest + 5 * dest_stride);
+ dest15 = LOAD_UB(dest + 10 * dest_stride);
+ res14 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest14);
+ res15 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest15);
+ res14 += out14;
+ res15 += out15;
+ res14 = CLIP_UNSIGNED_CHAR_H(res14);
+ res15 = CLIP_UNSIGNED_CHAR_H(res15);
+ res14 = (v8i16)__msa_pckev_b((v16i8)res14, (v16i8)res14);
+ res15 = (v8i16)__msa_pckev_b((v16i8)res15, (v16i8)res15);
+ STORE_DWORD(dest + 5 * dest_stride, __msa_copy_u_d((v2i64)res14, 0));
+ STORE_DWORD(dest + 10 * dest_stride, __msa_copy_u_d((v2i64)res15, 0));
+}
+
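+/* Hybrid 16x16 inverse transform. tx_type selects DCT or ADST independently
+ * per dimension; rows are transformed first in two 16x8 strips, then the
+ * columns in two 8x16 strips that also add the result into dest.
+ */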
+void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride, int32_t tx_type) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *out_ptr = &out[0];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vp9_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+ (dest + (i << 3)), dest_stride);
+ }
+ break;
+ case ADST_DCT:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vp9_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+ (dest + (i << 3)), dest_stride);
+ }
+ break;
+ case DCT_ADST:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vp9_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+ (dest + (i << 3)), dest_stride);
+ }
+ break;
+ case ADST_ADST:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vp9_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+ (dest + (i << 3)), dest_stride);
+ }
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c
new file mode 100644
index 00000000000..f576b50ea07
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c
@@ -0,0 +1,1077 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
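+/* Fixed-point rotation butterfly used throughout the inverse transforms.
+ * Per 16-bit lane:
+ *   out0[i] = ROUND_POWER_OF_TWO(reg0[i] * const0 - reg1[i] * const1,
+ *                                DCT_CONST_BITS);
+ *   out1[i] = ROUND_POWER_OF_TWO(reg1[i] * const0 + reg0[i] * const1,
+ *                                DCT_CONST_BITS);
+ * The two constants are replicated and interleaved once, so each output
+ * needs only a widening dot product and a rounding shift.
+ */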
+#define DOTP_CONST_PAIR(reg0, reg1, const0, const1, out0, out1) { \
+ v8i16 k0_m = __msa_fill_h(const0); \
+ v8i16 s0_m, s1_m, s2_m, s3_m; \
+ \
+ s0_m = __msa_fill_h(const1); \
+ k0_m = __msa_ilvev_h(s0_m, k0_m); \
+ \
+ s0_m = __msa_ilvl_h(-reg1, reg0); \
+ s1_m = __msa_ilvr_h(-reg1, reg0); \
+ s2_m = __msa_ilvl_h(reg0, reg1); \
+ s3_m = __msa_ilvr_h(reg0, reg1); \
+ s1_m = (v8i16)__msa_dotp_s_w(s1_m, k0_m); \
+ s0_m = (v8i16)__msa_dotp_s_w(s0_m, k0_m); \
+ s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \
+ s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_h(s0_m, s1_m); \
+ \
+ s1_m = (v8i16)__msa_dotp_s_w(s3_m, k0_m); \
+ s0_m = (v8i16)__msa_dotp_s_w(s2_m, k0_m); \
+ s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \
+ s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \
+ out1 = __msa_pckev_h(s0_m, s1_m); \
+}
+
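+/* Reconstruction helper for an 8-pixel-wide strip: adds four residual
+ * vectors to the destination rows at offsets 0, 4, 8 and 12 strides,
+ * clips the sums to [0, 255] and stores them back as 8-byte writes.
+ */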
+#define VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS(dest, dest_stride, \
+ in0, in1, in2, in3) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ v16u8 dest0_m, dest1_m, dest2_m, dest3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ v16i8 zero_m = { 0 }; \
+ uint8_t *dst_m = (uint8_t *)(dest); \
+ \
+ dest0_m = LOAD_UB(dst_m); \
+ dest1_m = LOAD_UB(dst_m + 4 * dest_stride); \
+ dest2_m = LOAD_UB(dst_m + 8 * dest_stride); \
+ dest3_m = LOAD_UB(dst_m + 12 * dest_stride); \
+ \
+ res0_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest0_m); \
+ res1_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest1_m); \
+ res2_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest2_m); \
+ res3_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest3_m); \
+ \
+ res0_m += (v8i16)(in0); \
+ res1_m += (v8i16)(in1); \
+ res2_m += (v8i16)(in2); \
+ res3_m += (v8i16)(in3); \
+ \
+ res0_m = CLIP_UNSIGNED_CHAR_H(res0_m); \
+ res1_m = CLIP_UNSIGNED_CHAR_H(res1_m); \
+ res2_m = CLIP_UNSIGNED_CHAR_H(res2_m); \
+ res3_m = CLIP_UNSIGNED_CHAR_H(res3_m); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)res1_m, (v16i8)res0_m); \
+ tmp1_m = __msa_pckev_b((v16i8)res3_m, (v16i8)res2_m); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += (4 * dest_stride); \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += (4 * dest_stride); \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += (4 * dest_stride); \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
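+/* Transpose one 32x8 strip of coefficients into tmp_buf as four 8x8
+ * blocks, so the 1-D passes below can load contiguous 8-element vectors
+ * with a pitch of 8.
+ */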
+static void vp9_idct32x8_row_transpose_store(const int16_t *input,
+ int16_t *tmp_buf) {
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7;
+ v8i16 n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* 1st & 2nd 8x8 */
+ LOAD_8VECS_SH(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
+ LOAD_8VECS_SH((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+ m0, n0, m1, n1, m2, n2, m3, n3);
+ TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+ m4, n4, m5, n5, m6, n6, m7, n7);
+ STORE_4VECS_SH((tmp_buf), 8, m0, n0, m1, n1);
+ STORE_4VECS_SH((tmp_buf + 4 * 8), 8, m2, n2, m3, n3);
+ STORE_4VECS_SH((tmp_buf + 8 * 8), 8, m4, n4, m5, n5);
+ STORE_4VECS_SH((tmp_buf + 12 * 8), 8, m6, n6, m7, n7);
+
+ /* 3rd & 4th 8x8 */
+ LOAD_8VECS_SH((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
+ LOAD_8VECS_SH((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+ m0, n0, m1, n1, m2, n2, m3, n3);
+ TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+ m4, n4, m5, n5, m6, n6, m7, n7);
+ STORE_4VECS_SH((tmp_buf + 16 * 8), 8, m0, n0, m1, n1);
+ STORE_4VECS_SH((tmp_buf + 20 * 8), 8, m2, n2, m3, n3);
+ STORE_4VECS_SH((tmp_buf + 24 * 8), 8, m4, n4, m5, n5);
+ STORE_4VECS_SH((tmp_buf + 28 * 8), 8, m6, n6, m7, n7);
+}
+
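+/* Even half of the 32-point 1-D IDCT (row pass): consumes the 16
+ * even-indexed input vectors and leaves 16 intermediate vectors in
+ * tmp_eve_buf.
+ */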
+static void vp9_idct32x8_row_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+ /* Even stage 1 */
+ LOAD_8VECS_SH(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+
+ vec0 = reg1 - reg5;
+ vec1 = reg1 + reg5;
+ vec2 = reg7 - reg3;
+ vec3 = reg7 + reg3;
+
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+
+ vec0 = reg4 - reg6;
+ vec1 = reg4 + reg6;
+ vec2 = reg0 - reg2;
+ vec3 = reg0 + reg2;
+
+ stp4 = vec0 - loc0;
+ stp3 = vec0 + loc0;
+ stp7 = vec1 - loc1;
+ stp0 = vec1 + loc1;
+ stp5 = vec2 - loc2;
+ stp2 = vec2 + loc2;
+ stp6 = vec3 - loc3;
+ stp1 = vec3 + loc3;
+
+ /* Even stage 2 */
+ LOAD_8VECS_SH((tmp_buf + 16), 32,
+ reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = reg0 + reg4;
+ reg0 = reg0 - reg4;
+ reg4 = reg6 + reg2;
+ reg6 = reg6 - reg2;
+ reg2 = reg1 + reg5;
+ reg1 = reg1 - reg5;
+ reg5 = reg7 + reg3;
+ reg7 = reg7 - reg3;
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = reg3 + reg4;
+ reg3 = reg3 - reg4;
+ reg4 = reg5 - vec1;
+ reg5 = reg5 + vec1;
+
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = reg0 - reg6;
+ reg0 = reg0 + reg6;
+ vec1 = reg7 - reg1;
+ reg7 = reg7 + reg1;
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+ /* Even stage 3: dependency on Even stage 1 & Even stage 2 */
+ loc0 = stp0 - reg5;
+ loc1 = stp0 + reg5;
+ loc2 = stp1 - reg7;
+ loc3 = stp1 + reg7;
+ STORE_SH(loc0, (tmp_eve_buf + 15 * 8));
+ STORE_SH(loc1, (tmp_eve_buf));
+ STORE_SH(loc2, (tmp_eve_buf + 14 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 8));
+
+ loc0 = stp2 - reg1;
+ loc1 = stp2 + reg1;
+ loc2 = stp3 - reg4;
+ loc3 = stp3 + reg4;
+ STORE_SH(loc0, (tmp_eve_buf + 13 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 2 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 12 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 3 * 8));
+
+ /* Store 8 */
+ loc0 = stp4 - reg3;
+ loc1 = stp4 + reg3;
+ loc2 = stp5 - reg6;
+ loc3 = stp5 + reg6;
+ STORE_SH(loc0, (tmp_eve_buf + 11 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 4 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 10 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 5 * 8));
+
+ loc0 = stp6 - reg0;
+ loc1 = stp6 + reg0;
+ loc2 = stp7 - reg2;
+ loc3 = stp7 + reg2;
+ STORE_SH(loc0, (tmp_eve_buf + 9 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 6 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 8 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 7 * 8));
+}
+
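+/* Odd half of the 32-point 1-D IDCT (row pass): consumes the 16
+ * odd-indexed input vectors and leaves 16 intermediate vectors in
+ * tmp_odd_buf.
+ */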
+static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ reg0 = LOAD_SH(tmp_buf + 8);
+ reg1 = LOAD_SH(tmp_buf + 7 * 8);
+ reg2 = LOAD_SH(tmp_buf + 9 * 8);
+ reg3 = LOAD_SH(tmp_buf + 15 * 8);
+ reg4 = LOAD_SH(tmp_buf + 17 * 8);
+ reg5 = LOAD_SH(tmp_buf + 23 * 8);
+ reg6 = LOAD_SH(tmp_buf + 25 * 8);
+ reg7 = LOAD_SH(tmp_buf + 31 * 8);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = reg0 + reg3;
+ reg0 = reg0 - reg3;
+ reg3 = reg7 + reg4;
+ reg7 = reg7 - reg4;
+ reg4 = reg1 + reg2;
+ reg1 = reg1 - reg2;
+ reg2 = reg6 + reg5;
+ reg6 = reg6 - reg5;
+ reg5 = vec0;
+
+ /* 4 Stores */
+ vec0 = reg5 + reg4;
+ vec1 = reg3 + reg2;
+ STORE_SH(vec0, (tmp_odd_buf + 4 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 5 * 8));
+
+ vec0 = reg5 - reg4;
+ vec1 = reg3 - reg2;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ STORE_SH(vec0, (tmp_odd_buf));
+ STORE_SH(vec1, (tmp_odd_buf + 8));
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+
+ vec0 = reg0 + reg1;
+ vec2 = reg7 - reg6;
+ vec1 = reg7 + reg6;
+ vec3 = reg0 - reg1;
+ STORE_SH(vec0, (tmp_odd_buf + 6 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 7 * 8));
+
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ STORE_SH(vec2, (tmp_odd_buf + 2 * 8));
+ STORE_SH(vec3, (tmp_odd_buf + 3 * 8));
+
+ /* Odd stage 2 */
+
+ /* 8 loads */
+ reg0 = LOAD_SH(tmp_buf + 3 * 8);
+ reg1 = LOAD_SH(tmp_buf + 5 * 8);
+ reg2 = LOAD_SH(tmp_buf + 11 * 8);
+ reg3 = LOAD_SH(tmp_buf + 13 * 8);
+ reg4 = LOAD_SH(tmp_buf + 19 * 8);
+ reg5 = LOAD_SH(tmp_buf + 21 * 8);
+ reg6 = LOAD_SH(tmp_buf + 27 * 8);
+ reg7 = LOAD_SH(tmp_buf + 29 * 8);
+
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ vec0 = reg1 - reg2;
+ vec1 = reg6 - reg5;
+ vec2 = reg0 - reg3;
+ vec3 = reg7 - reg4;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+ vec2 = loc2 - loc0;
+ vec3 = loc3 - loc1;
+ vec0 = loc2 + loc0;
+ vec1 = loc3 + loc1;
+ STORE_SH(vec0, (tmp_odd_buf + 12 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 15 * 8));
+
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+
+ STORE_SH(vec0, (tmp_odd_buf + 10 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 11 * 8));
+
+ /* 4 Stores */
+ vec0 = reg0 + reg3;
+ vec1 = reg1 + reg2;
+ vec2 = reg6 + reg5;
+ vec3 = reg7 + reg4;
+ reg0 = vec0 + vec1;
+ reg1 = vec3 + vec2;
+ reg2 = vec0 - vec1;
+ reg3 = vec3 - vec2;
+ STORE_SH(reg0, (tmp_odd_buf + 13 * 8));
+ STORE_SH(reg1, (tmp_odd_buf + 14 * 8));
+
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+
+ STORE_SH(reg0, (tmp_odd_buf + 8 * 8));
+ STORE_SH(reg1, (tmp_odd_buf + 9 * 8));
+
+ /* Odd stage 3: dependency on Odd stage 1 & Odd stage 2 */
+
+ /* Load 8 & Store 8 */
+ reg0 = LOAD_SH(tmp_odd_buf);
+ reg1 = LOAD_SH(tmp_odd_buf + 1 * 8);
+ reg2 = LOAD_SH(tmp_odd_buf + 2 * 8);
+ reg3 = LOAD_SH(tmp_odd_buf + 3 * 8);
+ reg4 = LOAD_SH(tmp_odd_buf + 8 * 8);
+ reg5 = LOAD_SH(tmp_odd_buf + 9 * 8);
+ reg6 = LOAD_SH(tmp_odd_buf + 10 * 8);
+ reg7 = LOAD_SH(tmp_odd_buf + 11 * 8);
+
+ loc0 = reg0 + reg4;
+ loc1 = reg1 + reg5;
+ loc2 = reg2 + reg6;
+ loc3 = reg3 + reg7;
+ STORE_SH(loc0, (tmp_odd_buf));
+ STORE_SH(loc1, (tmp_odd_buf + 1 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 2 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 3 * 8));
+
+ vec0 = reg0 - reg4;
+ vec1 = reg1 - reg5;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ vec0 = reg2 - reg6;
+ vec1 = reg3 - reg7;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ STORE_SH(loc0, (tmp_odd_buf + 8 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 9 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 10 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 11 * 8));
+
+ /* Load 8 & Store 8 */
+ reg1 = LOAD_SH(tmp_odd_buf + 4 * 8);
+ reg2 = LOAD_SH(tmp_odd_buf + 5 * 8);
+ reg0 = LOAD_SH(tmp_odd_buf + 6 * 8);
+ reg3 = LOAD_SH(tmp_odd_buf + 7 * 8);
+ reg4 = LOAD_SH(tmp_odd_buf + 12 * 8);
+ reg5 = LOAD_SH(tmp_odd_buf + 13 * 8);
+ reg6 = LOAD_SH(tmp_odd_buf + 14 * 8);
+ reg7 = LOAD_SH(tmp_odd_buf + 15 * 8);
+
+ loc0 = reg0 + reg4;
+ loc1 = reg1 + reg5;
+ loc2 = reg2 + reg6;
+ loc3 = reg3 + reg7;
+ STORE_SH(loc0, (tmp_odd_buf + 4 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 5 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 6 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 7 * 8));
+
+ vec0 = reg0 - reg4;
+ vec1 = reg3 - reg7;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ vec0 = reg1 - reg5;
+ vec1 = reg2 - reg6;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ STORE_SH(loc0, (tmp_odd_buf + 12 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 13 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 14 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 15 * 8));
+}
+
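+/* Final butterfly of the row pass: combines the even and odd halves into
+ * 32 output vectors and transposes them back to row order in the output
+ * buffer (pitch 32).
+ */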
+static void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf,
+ int16_t *dest) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7;
+ v8i16 n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* FINAL BUTTERFLY: dependency on even & odd stages */
+ /* Total: 32 loads, 32 stores */
+ vec0 = LOAD_SH(tmp_odd_buf);
+ vec1 = LOAD_SH(tmp_odd_buf + 9 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 14 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 6 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf);
+ loc1 = LOAD_SH(tmp_eve_buf + 8 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 4 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 12 * 8);
+
+ m0 = (loc0 + vec3);
+ STORE_SH((loc0 - vec3), (tmp_buf + 31 * 8));
+ STORE_SH((loc1 - vec2), (tmp_buf + 23 * 8));
+ m4 = (loc1 + vec2);
+ STORE_SH((loc2 - vec1), (tmp_buf + 27 * 8));
+ m2 = (loc2 + vec1);
+ STORE_SH((loc3 - vec0), (tmp_buf + 19 * 8));
+ m6 = (loc3 + vec0);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 4 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 13 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 10 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 3 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 2 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 10 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 6 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 14 * 8);
+
+ m1 = (loc0 + vec3);
+ STORE_SH((loc0 - vec3), (tmp_buf + 29 * 8));
+ STORE_SH((loc1 - vec2), (tmp_buf + 21 * 8));
+ m5 = (loc1 + vec2);
+ STORE_SH((loc2 - vec1), (tmp_buf + 25 * 8));
+ m3 = (loc2 + vec1);
+ STORE_SH((loc3 - vec0), (tmp_buf + 17 * 8));
+ m7 = (loc3 + vec0);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 2 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 11 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 12 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 7 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 1 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 9 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 5 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 13 * 8);
+
+ n0 = (loc0 + vec3);
+ STORE_SH((loc0 - vec3), (tmp_buf + 30 * 8));
+ STORE_SH((loc1 - vec2), (tmp_buf + 22 * 8));
+ n4 = (loc1 + vec2);
+ STORE_SH((loc2 - vec1), (tmp_buf + 26 * 8));
+ n2 = (loc2 + vec1);
+ STORE_SH((loc3 - vec0), (tmp_buf + 18 * 8));
+ n6 = (loc3 + vec0);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 5 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 15 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 8 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 1 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 3 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 11 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 7 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 15 * 8);
+
+ n1 = (loc0 + vec3);
+ STORE_SH((loc0 - vec3), (tmp_buf + 28 * 8));
+ STORE_SH((loc1 - vec2), (tmp_buf + 20 * 8));
+ n5 = (loc1 + vec2);
+ STORE_SH((loc2 - vec1), (tmp_buf + 24 * 8));
+ n3 = (loc2 + vec1);
+ STORE_SH((loc3 - vec0), (tmp_buf + 16 * 8));
+ n7 = (loc3 + vec0);
+
+ /* Transpose : 16 vectors */
+ /* 1st & 2nd 8x8 */
+ TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+ m0, n0, m1, n1, m2, n2, m3, n3);
+ STORE_4VECS_SH((dest + 0), 32, m0, n0, m1, n1);
+ STORE_4VECS_SH((dest + 4 * 32), 32, m2, n2, m3, n3);
+
+ TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+ m4, n4, m5, n5, m6, n6, m7, n7);
+ STORE_4VECS_SH((dest + 8), 32, m4, n4, m5, n5);
+ STORE_4VECS_SH((dest + 8 + 4 * 32), 32, m6, n6, m7, n7);
+
+ /* 3rd & 4th 8x8 */
+ LOAD_8VECS_SH((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
+ LOAD_8VECS_SH((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+ m0, n0, m1, n1, m2, n2, m3, n3);
+ STORE_4VECS_SH((dest + 16), 32, m0, n0, m1, n1);
+ STORE_4VECS_SH((dest + 16 + 4 * 32), 32, m2, n2, m3, n3);
+
+ TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+ m4, n4, m5, n5, m6, n6, m7, n7);
+ STORE_4VECS_SH((dest + 24), 32, m4, n4, m5, n5);
+ STORE_4VECS_SH((dest + 24 + 4 * 32), 32, m6, n6, m7, n7);
+}
+
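+/* One 32x8 strip of the row transform: transpose in, process the even and
+ * odd halves, then butterfly and transpose the result out.
+ */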
+static void vp9_idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+ vp9_idct32x8_row_transpose_store(input, &tmp_buf[0]);
+
+ vp9_idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
+
+ vp9_idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
+
+ vp9_idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0],
+ &tmp_odd_buf[0], output);
+}
+
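+/* Column-pass variant of the even half: same math as the row pass, but the
+ * inputs are read straight from the row-transformed buffer with a pitch of
+ * 32, so no transpose is needed.
+ */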
+static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+ /* Even stage 1 */
+ LOAD_8VECS_SH(tmp_buf, (4 * 32),
+ reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+
+ vec0 = reg1 - reg5;
+ vec1 = reg1 + reg5;
+ vec2 = reg7 - reg3;
+ vec3 = reg7 + reg3;
+
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+
+ vec0 = reg4 - reg6;
+ vec1 = reg4 + reg6;
+ vec2 = reg0 - reg2;
+ vec3 = reg0 + reg2;
+
+ stp4 = vec0 - loc0;
+ stp3 = vec0 + loc0;
+ stp7 = vec1 - loc1;
+ stp0 = vec1 + loc1;
+ stp5 = vec2 - loc2;
+ stp2 = vec2 + loc2;
+ stp6 = vec3 - loc3;
+ stp1 = vec3 + loc3;
+
+ /* Even stage 2 */
+ /* Load 8 */
+ LOAD_8VECS_SH((tmp_buf + 2 * 32), (4 * 32),
+ reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = reg0 + reg4;
+ reg0 = reg0 - reg4;
+ reg4 = reg6 + reg2;
+ reg6 = reg6 - reg2;
+ reg2 = reg1 + reg5;
+ reg1 = reg1 - reg5;
+ reg5 = reg7 + reg3;
+ reg7 = reg7 - reg3;
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = reg3 + reg4;
+ reg3 = reg3 - reg4;
+ reg4 = reg5 - vec1;
+ reg5 = reg5 + vec1;
+
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = reg0 - reg6;
+ reg0 = reg0 + reg6;
+ vec1 = reg7 - reg1;
+ reg7 = reg7 + reg1;
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+ /* Even stage 3: dependency on Even stage 1 & Even stage 2 */
+ /* Store 8 */
+ loc0 = stp0 - reg5;
+ loc1 = stp0 + reg5;
+ loc2 = stp1 - reg7;
+ loc3 = stp1 + reg7;
+ STORE_SH(loc0, (tmp_eve_buf + 15 * 8));
+ STORE_SH(loc1, (tmp_eve_buf));
+ STORE_SH(loc2, (tmp_eve_buf + 14 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 1 * 8));
+
+ loc0 = stp2 - reg1;
+ loc1 = stp2 + reg1;
+ loc2 = stp3 - reg4;
+ loc3 = stp3 + reg4;
+ STORE_SH(loc0, (tmp_eve_buf + 13 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 2 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 12 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 3 * 8));
+
+ /* Store 8 */
+ loc0 = stp4 - reg3;
+ loc1 = stp4 + reg3;
+ loc2 = stp5 - reg6;
+ loc3 = stp5 + reg6;
+ STORE_SH(loc0, (tmp_eve_buf + 11 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 4 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 10 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 5 * 8));
+
+ loc0 = stp6 - reg0;
+ loc1 = stp6 + reg0;
+ loc2 = stp7 - reg2;
+ loc3 = stp7 + reg2;
+ STORE_SH(loc0, (tmp_eve_buf + 9 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 6 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 8 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 7 * 8));
+}
+
+static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ reg0 = LOAD_SH(tmp_buf + 32);
+ reg1 = LOAD_SH(tmp_buf + 7 * 32);
+ reg2 = LOAD_SH(tmp_buf + 9 * 32);
+ reg3 = LOAD_SH(tmp_buf + 15 * 32);
+ reg4 = LOAD_SH(tmp_buf + 17 * 32);
+ reg5 = LOAD_SH(tmp_buf + 23 * 32);
+ reg6 = LOAD_SH(tmp_buf + 25 * 32);
+ reg7 = LOAD_SH(tmp_buf + 31 * 32);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = reg0 + reg3;
+ reg0 = reg0 - reg3;
+ reg3 = reg7 + reg4;
+ reg7 = reg7 - reg4;
+ reg4 = reg1 + reg2;
+ reg1 = reg1 - reg2;
+ reg2 = reg6 + reg5;
+ reg6 = reg6 - reg5;
+ reg5 = vec0;
+
+ /* 4 Stores */
+ vec0 = reg5 + reg4;
+ vec1 = reg3 + reg2;
+ STORE_SH(vec0, (tmp_odd_buf + 4 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 5 * 8));
+
+ vec0 = reg5 - reg4;
+ vec1 = reg3 - reg2;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ STORE_SH(vec0, (tmp_odd_buf));
+ STORE_SH(vec1, (tmp_odd_buf + 1 * 8));
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+
+ vec0 = reg0 + reg1;
+ vec2 = reg7 - reg6;
+ vec1 = reg7 + reg6;
+ vec3 = reg0 - reg1;
+ STORE_SH(vec0, (tmp_odd_buf + 6 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 7 * 8));
+
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ STORE_SH(vec2, (tmp_odd_buf + 2 * 8));
+ STORE_SH(vec3, (tmp_odd_buf + 3 * 8));
+
+ /* Odd stage 2 */
+ /* 8 loads */
+ reg0 = LOAD_SH(tmp_buf + 3 * 32);
+ reg1 = LOAD_SH(tmp_buf + 5 * 32);
+ reg2 = LOAD_SH(tmp_buf + 11 * 32);
+ reg3 = LOAD_SH(tmp_buf + 13 * 32);
+ reg4 = LOAD_SH(tmp_buf + 19 * 32);
+ reg5 = LOAD_SH(tmp_buf + 21 * 32);
+ reg6 = LOAD_SH(tmp_buf + 27 * 32);
+ reg7 = LOAD_SH(tmp_buf + 29 * 32);
+
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ vec0 = reg1 - reg2;
+ vec1 = reg6 - reg5;
+ vec2 = reg0 - reg3;
+ vec3 = reg7 - reg4;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+ vec2 = loc2 - loc0;
+ vec3 = loc3 - loc1;
+ vec0 = loc2 + loc0;
+ vec1 = loc3 + loc1;
+ STORE_SH(vec0, (tmp_odd_buf + 12 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 15 * 8));
+
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+
+ STORE_SH(vec0, (tmp_odd_buf + 10 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 11 * 8));
+
+ /* 4 Stores */
+ vec0 = reg0 + reg3;
+ vec1 = reg1 + reg2;
+ vec2 = reg6 + reg5;
+ vec3 = reg7 + reg4;
+ reg0 = vec0 + vec1;
+ reg1 = vec3 + vec2;
+ reg2 = vec0 - vec1;
+ reg3 = vec3 - vec2;
+ STORE_SH(reg0, (tmp_odd_buf + 13 * 8));
+ STORE_SH(reg1, (tmp_odd_buf + 14 * 8));
+
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+
+ STORE_SH(reg0, (tmp_odd_buf + 8 * 8));
+ STORE_SH(reg1, (tmp_odd_buf + 9 * 8));
+
+ /* Odd stage 3: dependency on Odd stage 1 & Odd stage 2 */
+ /* Load 8 & Store 8 */
+ reg0 = LOAD_SH(tmp_odd_buf);
+ reg1 = LOAD_SH(tmp_odd_buf + 1 * 8);
+ reg2 = LOAD_SH(tmp_odd_buf + 2 * 8);
+ reg3 = LOAD_SH(tmp_odd_buf + 3 * 8);
+ reg4 = LOAD_SH(tmp_odd_buf + 8 * 8);
+ reg5 = LOAD_SH(tmp_odd_buf + 9 * 8);
+ reg6 = LOAD_SH(tmp_odd_buf + 10 * 8);
+ reg7 = LOAD_SH(tmp_odd_buf + 11 * 8);
+
+ loc0 = reg0 + reg4;
+ loc1 = reg1 + reg5;
+ loc2 = reg2 + reg6;
+ loc3 = reg3 + reg7;
+ STORE_SH(loc0, (tmp_odd_buf));
+ STORE_SH(loc1, (tmp_odd_buf + 1 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 2 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 3 * 8));
+
+ vec0 = reg0 - reg4;
+ vec1 = reg1 - reg5;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ vec0 = reg2 - reg6;
+ vec1 = reg3 - reg7;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ STORE_SH(loc0, (tmp_odd_buf + 8 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 9 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 10 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 11 * 8));
+
+ /* Load 8 & Store 8 */
+ reg1 = LOAD_SH(tmp_odd_buf + 4 * 8);
+ reg2 = LOAD_SH(tmp_odd_buf + 5 * 8);
+ reg0 = LOAD_SH(tmp_odd_buf + 6 * 8);
+ reg3 = LOAD_SH(tmp_odd_buf + 7 * 8);
+ reg4 = LOAD_SH(tmp_odd_buf + 12 * 8);
+ reg5 = LOAD_SH(tmp_odd_buf + 13 * 8);
+ reg6 = LOAD_SH(tmp_odd_buf + 14 * 8);
+ reg7 = LOAD_SH(tmp_odd_buf + 15 * 8);
+
+ loc0 = reg0 + reg4;
+ loc1 = reg1 + reg5;
+ loc2 = reg2 + reg6;
+ loc3 = reg3 + reg7;
+ STORE_SH(loc0, (tmp_odd_buf + 4 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 5 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 6 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 7 * 8));
+
+ vec0 = reg0 - reg4;
+ vec1 = reg3 - reg7;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ vec0 = reg1 - reg5;
+ vec1 = reg2 - reg6;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ STORE_SH(loc0, (tmp_odd_buf + 12 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 13 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 14 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 15 * 8));
+}
+
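+/* Final butterfly of the column pass: combines the even and odd halves,
+ * rounds each result by 6 bits and accumulates it into the destination
+ * block through the add/clip/store helper above.
+ */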
+static void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf,
+ uint8_t *dest,
+ int32_t dest_stride) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7;
+ v8i16 n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* FINAL BUTTERFLY: dependency on even & odd stages */
+ vec0 = LOAD_SH(tmp_odd_buf);
+ vec1 = LOAD_SH(tmp_odd_buf + 9 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 14 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 6 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf);
+ loc1 = LOAD_SH(tmp_eve_buf + 8 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 4 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 12 * 8);
+
+ m0 = (loc0 + vec3);
+ m4 = (loc1 + vec2);
+ m2 = (loc2 + vec1);
+ m6 = (loc3 + vec0);
+ SRARI_H_4VECS_SH(m0, m2, m4, m6, m0, m2, m4, m6, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS(dest, dest_stride, m0, m2, m4, m6);
+
+ m6 = (loc0 - vec3);
+ m2 = (loc1 - vec2);
+ m4 = (loc2 - vec1);
+ m0 = (loc3 - vec0);
+ SRARI_H_4VECS_SH(m0, m2, m4, m6, m0, m2, m4, m6, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 19 * dest_stride),
+ dest_stride, m0, m2, m4, m6);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 4 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 13 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 10 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 3 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 2 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 10 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 6 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 14 * 8);
+
+ m1 = (loc0 + vec3);
+ m5 = (loc1 + vec2);
+ m3 = (loc2 + vec1);
+ m7 = (loc3 + vec0);
+ SRARI_H_4VECS_SH(m1, m3, m5, m7, m1, m3, m5, m7, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 2 * dest_stride),
+ dest_stride, m1, m3, m5, m7);
+
+ m7 = (loc0 - vec3);
+ m3 = (loc1 - vec2);
+ m5 = (loc2 - vec1);
+ m1 = (loc3 - vec0);
+ SRARI_H_4VECS_SH(m1, m3, m5, m7, m1, m3, m5, m7, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 17 * dest_stride),
+ dest_stride, m1, m3, m5, m7);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 2 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 11 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 12 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 7 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 1 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 9 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 5 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 13 * 8);
+
+ n0 = (loc0 + vec3);
+ n4 = (loc1 + vec2);
+ n2 = (loc2 + vec1);
+ n6 = (loc3 + vec0);
+ SRARI_H_4VECS_SH(n0, n2, n4, n6, n0, n2, n4, n6, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 1 * dest_stride),
+ dest_stride, n0, n2, n4, n6);
+
+ n6 = (loc0 - vec3);
+ n2 = (loc1 - vec2);
+ n4 = (loc2 - vec1);
+ n0 = (loc3 - vec0);
+ SRARI_H_4VECS_SH(n0, n2, n4, n6, n0, n2, n4, n6, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 18 * dest_stride),
+ dest_stride, n0, n2, n4, n6);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 5 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 15 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 8 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 1 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 3 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 11 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 7 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 15 * 8);
+
+ n1 = (loc0 + vec3);
+ n5 = (loc1 + vec2);
+ n3 = (loc2 + vec1);
+ n7 = (loc3 + vec0);
+ SRARI_H_4VECS_SH(n1, n3, n5, n7, n1, n3, n5, n7, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 3 * dest_stride),
+ dest_stride, n1, n3, n5, n7);
+
+ n7 = (loc0 - vec3);
+ n3 = (loc1 - vec2);
+ n5 = (loc2 - vec1);
+ n1 = (loc3 - vec0);
+ SRARI_H_4VECS_SH(n1, n3, n5, n7, n1, n3, n5, n7, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 16 * dest_stride),
+ dest_stride, n1, n3, n5, n7);
+}
+
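+/* One 8x32 strip of the column transform, including reconstruction. */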
+static void vp9_idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+ vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+
+ vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+
+ vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
+ dest, dest_stride);
+}
+
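+/* Full 32x32 inverse DCT: four 32x8 row strips into a scratch buffer,
+ * followed by four 8x32 column strips that add the result into dest.
+ */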
+void vp9_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+
+ /* transform rows */
+ for (i = 0; i < 4; ++i) {
+ /* process 32 * 8 block */
+ vp9_idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dest + (i << 3)),
+ dest_stride);
+ }
+}
+
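+/* 32x32 inverse DCT for at most 34 non-zero coefficients, which all lie in
+ * the upper-left 8x8 corner: zero the scratch buffer, transform a single
+ * row strip, then run the column pass over the full width.
+ */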
+void vp9_idct32x32_34_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+
+ for (i = 32; i--;) {
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[out_ptr]) \n\t"
+ "sw $zero, 4(%[out_ptr]) \n\t"
+ "sw $zero, 8(%[out_ptr]) \n\t"
+ "sw $zero, 12(%[out_ptr]) \n\t"
+ "sw $zero, 16(%[out_ptr]) \n\t"
+ "sw $zero, 20(%[out_ptr]) \n\t"
+ "sw $zero, 24(%[out_ptr]) \n\t"
+ "sw $zero, 28(%[out_ptr]) \n\t"
+ "sw $zero, 32(%[out_ptr]) \n\t"
+ "sw $zero, 36(%[out_ptr]) \n\t"
+ "sw $zero, 40(%[out_ptr]) \n\t"
+ "sw $zero, 44(%[out_ptr]) \n\t"
+ "sw $zero, 48(%[out_ptr]) \n\t"
+ "sw $zero, 52(%[out_ptr]) \n\t"
+ "sw $zero, 56(%[out_ptr]) \n\t"
+ "sw $zero, 60(%[out_ptr]) \n\t"
+
+ :
+ : [out_ptr] "r" (out_ptr)
+ );
+
+ out_ptr += 32;
+ }
+
+ out_ptr = out_arr;
+
+ /* transform rows: only the upper-left 8x8 block has non-zero coefficients */
+ vp9_idct32x8_1d_rows_msa(input, out_ptr);
+
+ /* transform columns */
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dest + (i << 3)),
+ dest_stride);
+ }
+}
+
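+/* DC-only 32x32 path. The lone DC coefficient reduces to one constant,
+ *   dc = ROUND_POWER_OF_TWO(dct_const_round_shift(
+ *            dct_const_round_shift(input[0] * cospi_16_64) * cospi_16_64),
+ *        6),
+ * which is splatted across a vector and added, with clipping, to every
+ * pixel; each loop iteration covers two 32-pixel rows.
+ */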
+void vp9_idct32x32_1_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ int32_t i, const1;
+ v8i16 const2;
+ int16_t out;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+ v16u8 dest0, dest1, dest2, dest3;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16i8 zero = { 0 };
+
+ out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ const1 = ROUND_POWER_OF_TWO(out, 6);
+
+ const2 = __msa_fill_h(const1);
+
+ for (i = 0; i < 16; ++i) {
+ dest0 = LOAD_UB(dest);
+ dest1 = LOAD_UB(dest + 16);
+ dest2 = LOAD_UB(dest + dest_stride);
+ dest3 = LOAD_UB(dest + dest_stride + 16);
+
+ res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0);
+ res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1);
+ res2 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest2);
+ res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3);
+ res4 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest0);
+ res5 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest1);
+ res6 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest2);
+ res7 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest3);
+
+ res0 += const2;
+ res1 += const2;
+ res2 += const2;
+ res3 += const2;
+ res4 += const2;
+ res5 += const2;
+ res6 += const2;
+ res7 += const2;
+
+ res0 = CLIP_UNSIGNED_CHAR_H(res0);
+ res1 = CLIP_UNSIGNED_CHAR_H(res1);
+ res2 = CLIP_UNSIGNED_CHAR_H(res2);
+ res3 = CLIP_UNSIGNED_CHAR_H(res3);
+ res4 = CLIP_UNSIGNED_CHAR_H(res4);
+ res5 = CLIP_UNSIGNED_CHAR_H(res5);
+ res6 = CLIP_UNSIGNED_CHAR_H(res6);
+ res7 = CLIP_UNSIGNED_CHAR_H(res7);
+
+ tmp0 = (v16u8)__msa_pckev_b((v16i8)res4, (v16i8)res0);
+ tmp1 = (v16u8)__msa_pckev_b((v16i8)res5, (v16i8)res1);
+ tmp2 = (v16u8)__msa_pckev_b((v16i8)res6, (v16i8)res2);
+ tmp3 = (v16u8)__msa_pckev_b((v16i8)res7, (v16i8)res3);
+
+ STORE_UB(tmp0, dest);
+ STORE_UB(tmp1, dest + 16);
+ dest += dest_stride;
+ STORE_UB(tmp2, dest);
+ STORE_UB(tmp3, dest + 16);
+ dest += dest_stride;
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h
new file mode 100644
index 00000000000..d7aabbb8898
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h
@@ -0,0 +1,867 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
+#define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if HAVE_MSA
+/* load macros */
+#define LOAD_UB(psrc) *((const v16u8 *)(psrc))
+#define LOAD_SB(psrc) *((const v16i8 *)(psrc))
+#define LOAD_UH(psrc) *((const v8u16 *)(psrc))
+#define LOAD_SH(psrc) *((const v8i16 *)(psrc))
+#define LOAD_UW(psrc) *((const v4u32 *)(psrc))
+#define LOAD_SW(psrc) *((const v4i32 *)(psrc))
+#define LOAD_UD(psrc) *((const v2u64 *)(psrc))
+#define LOAD_SD(psrc) *((const v2i64 *)(psrc))
+
+/* store macros */
+#define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec)
+#define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec)
+#define STORE_UH(vec, pdest) *((v8u16 *)(pdest)) = (vec)
+#define STORE_SH(vec, pdest) *((v8i16 *)(pdest)) = (vec)
+#define STORE_UW(vec, pdest) *((v4u32 *)(pdest)) = (vec)
+#define STORE_SW(vec, pdest) *((v4i32 *)(pdest)) = (vec)
+#define STORE_UD(vec, pdest) *((v2u64 *)(pdest)) = (vec)
+#define STORE_SD(vec, pdest) *((v2i64 *)(pdest)) = (vec)
+
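+/* Scalar word/dword access helpers. MIPS release 6 supports unaligned
+ * lw/sw (and ld/sd on 64-bit) directly, so plain loads/stores are used
+ * there; pre-R6 builds fall back to the ulw/usw/uld unaligned-access
+ * macros instead.
+ */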
+#if (__mips_isa_rev >= 6)
+#define LOAD_WORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+
+#if (__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "ld %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+#else // !(__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src1_m = (const uint8_t *)(psrc); \
+ const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \
+ uint32_t val0_m, val1_m; \
+ uint64_t genval_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[val0_m], %[src1_m] \n\t" \
+ \
+ : [val0_m] "=r" (val0_m) \
+ : [src1_m] "m" (*src1_m) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[val1_m], %[src2_m] \n\t" \
+ \
+ : [val1_m] "=r" (val1_m) \
+ : [src2_m] "m" (*src2_m) \
+ ); \
+ \
+ genval_m = (uint64_t)(val1_m); \
+ genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \
+ genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \
+ \
+ genval_m; \
+})
+#endif // (__mips == 64)
+#define STORE_WORD_WITH_OFFSET_1(pdst, val) { \
+ uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_WORD(pdst, val) { \
+ uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_DWORD(pdst, val) { \
+ uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
+ const uint64_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sd %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+#else // !(__mips_isa_rev >= 6)
+#define LOAD_WORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+
+#if (__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "uld %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+#else // !(__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src1_m = (const uint8_t *)(psrc); \
+ const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \
+ uint32_t val0_m, val1_m; \
+ uint64_t genval_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[val0_m], %[src1_m] \n\t" \
+ \
+ : [val0_m] "=r" (val0_m) \
+ : [src1_m] "m" (*src1_m) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[val1_m], %[src2_m] \n\t" \
+ \
+ : [val1_m] "=r" (val1_m) \
+ : [src2_m] "m" (*src2_m) \
+ ); \
+ \
+ genval_m = (uint64_t)(val1_m); \
+ genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \
+ genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \
+ \
+ genval_m; \
+})
+#endif // (__mips == 64)
+
+#define STORE_WORD_WITH_OFFSET_1(pdst, val) { \
+ uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_WORD(pdst, val) { \
+ uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_DWORD(pdst, val) { \
+ uint8_t *dst1_m = (uint8_t *)(pdst); \
+ uint8_t *dst2_m = ((uint8_t *)(pdst)) + 4; \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val0_m], %[dst1_m] \n\t" \
+ "usw %[val1_m], %[dst2_m] \n\t" \
+ \
+ : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \
+ : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \
+ ); \
+}
+#endif // (__mips_isa_rev >= 6)
+
+#define LOAD_2VECS_UB(psrc, stride, \
+ val0, val1) { \
+ val0 = LOAD_UB(psrc + 0 * stride); \
+ val1 = LOAD_UB(psrc + 1 * stride); \
+}
+
+#define LOAD_4VECS_UB(psrc, stride, \
+ val0, val1, val2, val3) { \
+ val0 = LOAD_UB(psrc + 0 * stride); \
+ val1 = LOAD_UB(psrc + 1 * stride); \
+ val2 = LOAD_UB(psrc + 2 * stride); \
+ val3 = LOAD_UB(psrc + 3 * stride); \
+}
+
+#define LOAD_4VECS_SB(psrc, stride, \
+ val0, val1, val2, val3) { \
+ val0 = LOAD_SB(psrc + 0 * stride); \
+ val1 = LOAD_SB(psrc + 1 * stride); \
+ val2 = LOAD_SB(psrc + 2 * stride); \
+ val3 = LOAD_SB(psrc + 3 * stride); \
+}
+
+#define LOAD_5VECS_UB(psrc, stride, \
+ out0, out1, out2, out3, out4) { \
+ LOAD_4VECS_UB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ out4 = LOAD_UB(psrc + 4 * stride); \
+}
+
+#define LOAD_5VECS_SB(psrc, stride, \
+ out0, out1, out2, out3, out4) { \
+ LOAD_4VECS_SB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ out4 = LOAD_SB(psrc + 4 * stride); \
+}
+
+#define LOAD_7VECS_SB(psrc, stride, \
+ val0, val1, val2, val3, \
+ val4, val5, val6) { \
+ val0 = LOAD_SB((psrc) + 0 * (stride)); \
+ val1 = LOAD_SB((psrc) + 1 * (stride)); \
+ val2 = LOAD_SB((psrc) + 2 * (stride)); \
+ val3 = LOAD_SB((psrc) + 3 * (stride)); \
+ val4 = LOAD_SB((psrc) + 4 * (stride)); \
+ val5 = LOAD_SB((psrc) + 5 * (stride)); \
+ val6 = LOAD_SB((psrc) + 6 * (stride)); \
+}
+
+#define LOAD_8VECS_UB(psrc, stride, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ LOAD_4VECS_UB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ LOAD_4VECS_UB((psrc + 4 * stride), (stride), \
+ (out4), (out5), (out6), (out7)); \
+}
+
+#define LOAD_8VECS_SB(psrc, stride, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ LOAD_4VECS_SB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ LOAD_4VECS_SB((psrc + 4 * stride), (stride), \
+ (out4), (out5), (out6), (out7)); \
+}
+
+#define LOAD_2VECS_SH(psrc, stride, \
+ val0, val1) { \
+ val0 = LOAD_SH((psrc) + 0 * (stride)); \
+ val1 = LOAD_SH((psrc) + 1 * (stride)); \
+}
+
+#define LOAD_4VECS_SH(psrc, stride, \
+ val0, val1, val2, val3) { \
+ LOAD_2VECS_SH((psrc), (stride), val0, val1); \
+ LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \
+}
+
+#define LOAD_8VECS_SH(psrc, stride, \
+ val0, val1, val2, val3, \
+ val4, val5, val6, val7) { \
+ LOAD_4VECS_SH((psrc), (stride), \
+ val0, val1, val2, val3); \
+ LOAD_4VECS_SH((psrc + 4 * stride), (stride), \
+ val4, val5, val6, val7); \
+}
+
+#define LOAD_16VECS_SH(psrc, stride, \
+ val0, val1, val2, val3, \
+ val4, val5, val6, val7, \
+ val8, val9, val10, val11, \
+ val12, val13, val14, val15) { \
+ LOAD_8VECS_SH((psrc), (stride), \
+ val0, val1, val2, val3, \
+ val4, val5, val6, val7); \
+ LOAD_8VECS_SH((psrc + 8 * (stride)), (stride), \
+ val8, val9, val10, val11, \
+ val12, val13, val14, val15); \
+}
+
+#define STORE_4VECS_UB(dst_out, pitch, \
+ in0, in1, in2, in3) { \
+ STORE_UB((in0), (dst_out)); \
+ STORE_UB((in1), ((dst_out) + (pitch))); \
+ STORE_UB((in2), ((dst_out) + 2 * (pitch))); \
+ STORE_UB((in3), ((dst_out) + 3 * (pitch))); \
+}
+
+#define STORE_8VECS_UB(dst_out, pitch_in, \
+ in0, in1, in2, in3, \
+ in4, in5, in6, in7) { \
+ STORE_4VECS_UB(dst_out, pitch_in, \
+ in0, in1, in2, in3); \
+ STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \
+ in4, in5, in6, in7); \
+}
+
+#define VEC_INSERT_4W_UB(src, src0, src1, src2, src3) { \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 0, (src0)); \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 1, (src1)); \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 2, (src2)); \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 3, (src3)); \
+}
+
+#define VEC_INSERT_2DW_UB(src, src0, src1) { \
+ src = (v16u8)__msa_insert_d((v2i64)(src), 0, (src0)); \
+ src = (v16u8)__msa_insert_d((v2i64)(src), 1, (src1)); \
+}
+
+#define STORE_4VECS_SH(ptr, stride, \
+ in0, in1, in2, in3) { \
+ STORE_SH(in0, ((ptr) + 0 * stride)); \
+ STORE_SH(in1, ((ptr) + 1 * stride)); \
+ STORE_SH(in2, ((ptr) + 2 * stride)); \
+ STORE_SH(in3, ((ptr) + 3 * stride)); \
+}
+
+#define STORE_8VECS_SH(ptr, stride, \
+ in0, in1, in2, in3, \
+ in4, in5, in6, in7) { \
+ STORE_SH(in0, ((ptr) + 0 * stride)); \
+ STORE_SH(in1, ((ptr) + 1 * stride)); \
+ STORE_SH(in2, ((ptr) + 2 * stride)); \
+ STORE_SH(in3, ((ptr) + 3 * stride)); \
+ STORE_SH(in4, ((ptr) + 4 * stride)); \
+ STORE_SH(in5, ((ptr) + 5 * stride)); \
+ STORE_SH(in6, ((ptr) + 6 * stride)); \
+ STORE_SH(in7, ((ptr) + 7 * stride)); \
+}
+
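+/* Clamp each signed halfword lane to the unsigned 8-bit range [0, 255]
+ * ahead of packing back down to bytes.
+ */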
+#define CLIP_UNSIGNED_CHAR_H(in) ({ \
+ v8i16 max_m = __msa_ldi_h(255); \
+ v8i16 out_m; \
+ \
+ out_m = __msa_maxi_s_h((v8i16)(in), 0); \
+ out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
+ out_m; \
+})
+
+/* halfword 8x8 transpose macro */
+#define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ v8i16 s0_m, s1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ s0_m = __msa_ilvr_h((v8i16)(in6), (v8i16)(in4)); \
+ s1_m = __msa_ilvr_h((v8i16)(in7), (v8i16)(in5)); \
+ tmp0_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
+ tmp1_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
+ \
+ s0_m = __msa_ilvl_h((v8i16)(in6), (v8i16)(in4)); \
+ s1_m = __msa_ilvl_h((v8i16)(in7), (v8i16)(in5)); \
+ tmp2_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
+ tmp3_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
+ \
+ s0_m = __msa_ilvr_h((v8i16)(in2), (v8i16)(in0)); \
+ s1_m = __msa_ilvr_h((v8i16)(in3), (v8i16)(in1)); \
+ tmp4_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
+ tmp5_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
+ \
+ s0_m = __msa_ilvl_h((v8i16)(in2), (v8i16)(in0)); \
+ s1_m = __msa_ilvl_h((v8i16)(in3), (v8i16)(in1)); \
+ tmp6_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
+ tmp7_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
+ \
+ out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+ out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+}
+
+/* interleave macros */
+/* no in-place support: output operands must not alias the inputs */
+#define ILV_B_LRLR_UB(in0, in1, in2, in3, \
+ out0, out1, out2, out3) { \
+ out0 = (v16u8)__msa_ilvl_b((v16i8)(in1), (v16i8)(in0)); \
+ out1 = (v16u8)__msa_ilvr_b((v16i8)(in1), (v16i8)(in0)); \
+ out2 = (v16u8)__msa_ilvl_b((v16i8)(in3), (v16i8)(in2)); \
+ out3 = (v16u8)__msa_ilvr_b((v16i8)(in3), (v16i8)(in2)); \
+}
+
+#define ILV_H_LRLR_SH(in0, in1, in2, in3, \
+ out0, out1, out2, out3) { \
+ out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \
+ out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \
+ out2 = __msa_ilvl_h((v8i16)(in3), (v8i16)(in2)); \
+ out3 = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2)); \
+}
+
+#define ILV_H_LR_SH(in0, in1, out0, out1) { \
+ out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \
+ out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \
+}
+
+#define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1) { \
+ out0 = (v16u8)__msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \
+ out1 = (v16u8)__msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \
+}
+
+#define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1) { \
+ out0 = __msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \
+ out1 = __msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \
+}
+
+#define ILVR_B_4VECS_UB(in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3) { \
+ ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_UB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+}
+
+#define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3) { \
+ ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+}
+
+#define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \
+ in3_r, in4_r, in5_r, \
+ in0_l, in1_l, in2_l, \
+ in3_l, in4_l, in5_l, \
+ out0, out1, out2, \
+ out3, out4, out5) { \
+ ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+ ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
+ out4, out5); \
+}
+
+#define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \
+ in4_r, in5_r, in6_r, in7_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ in4_l, in5_l, in6_l, in7_l, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+ ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
+ out4, out5); \
+ ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \
+ out6, out7); \
+}
+
+#define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1) { \
+ out0 = __msa_ilvl_b((v16i8)(in0_l), (v16i8)(in0_r)); \
+ out1 = __msa_ilvl_b((v16i8)(in1_l), (v16i8)(in1_r)); \
+}
+
+#define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3) { \
+ ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+}
+
+#define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \
+ in3_r, in4_r, in5_r, \
+ in0_l, in1_l, in2_l, \
+ in3_l, in4_l, in5_l, \
+ out0, out1, out2, \
+ out3, out4, out5) { \
+ ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+ ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
+ out4, out5); \
+}
+
+#define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r) { \
+ out0 = (v16i8)__msa_ilvr_d((v2i64)(in0_l), (v2i64)(in0_r)); \
+ out1 = (v16i8)__msa_ilvr_d((v2i64)(in1_l), (v2i64)(in1_r)); \
+}
+
+#define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r, \
+ out2, in2_l, in2_r) { \
+ ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r); \
+ out2 = (v16i8)__msa_ilvr_d((v2i64)(in2_l), (v2i64)(in2_r)); \
+}
+
+#define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r, \
+ out2, in2_l, in2_r, \
+ out3, in3_l, in3_r) { \
+ ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r); \
+ ILVR_D_2VECS_SB(out2, in2_l, in2_r, \
+ out3, in3_l, in3_r); \
+}
+
+#define DOTP_S_W_4VECS_SW(m0, c0, m1, c1, \
+ m2, c2, m3, c3, \
+ out0, out1, out2, out3) { \
+ out0 = __msa_dotp_s_w((v8i16)(m0), (v8i16)(c0)); \
+ out1 = __msa_dotp_s_w((v8i16)(m1), (v8i16)(c1)); \
+ out2 = __msa_dotp_s_w((v8i16)(m2), (v8i16)(c2)); \
+ out3 = __msa_dotp_s_w((v8i16)(m3), (v8i16)(c3)); \
+}
+
+#define PCKEV_H_2VECS_SH(in0_l, in0_r, in1_l, in1_r, \
+ out0, out1) { \
+ out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r)); \
+ out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r)); \
+}
+
+#define XORI_B_2VECS_UB(val0, val1, \
+ out0, out1, xor_val) { \
+ out0 = __msa_xori_b((v16u8)(val0), (xor_val)); \
+ out1 = __msa_xori_b((v16u8)(val1), (xor_val)); \
+}
+
+#define XORI_B_2VECS_SB(val0, val1, \
+ out0, out1, xor_val) { \
+ out0 = (v16i8)__msa_xori_b((v16u8)(val0), (xor_val)); \
+ out1 = (v16i8)__msa_xori_b((v16u8)(val1), (xor_val)); \
+}
+
+#define XORI_B_3VECS_SB(val0, val1, val2, \
+ out0, out1, out2, xor_val) { \
+ XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \
+ out2 = (v16i8)__msa_xori_b((v16u8)(val2), (xor_val)); \
+}
+
+#define XORI_B_4VECS_UB(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ xor_val) { \
+ XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \
+ XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \
+}
+
+#define XORI_B_4VECS_SB(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ xor_val) { \
+ XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \
+ XORI_B_2VECS_SB(val2, val3, out2, out3, xor_val); \
+}
+
+#define XORI_B_7VECS_SB(val0, val1, val2, val3, \
+ val4, val5, val6, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, \
+ xor_val) { \
+ XORI_B_4VECS_SB(val0, val1, val2, val3, \
+ out0, out1, out2, out3, xor_val); \
+ XORI_B_3VECS_SB(val4, val5, val6, \
+ out4, out5, out6, xor_val); \
+}
+
+#define SRARI_H_4VECS_UH(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ shift_right_val) { \
+ out0 = (v8u16)__msa_srari_h((v8i16)(val0), (shift_right_val)); \
+ out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val)); \
+ out2 = (v8u16)__msa_srari_h((v8i16)(val2), (shift_right_val)); \
+ out3 = (v8u16)__msa_srari_h((v8i16)(val3), (shift_right_val)); \
+}
+
+#define SRARI_H_4VECS_SH(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ shift_right_val) { \
+ out0 = __msa_srari_h((v8i16)(val0), (shift_right_val)); \
+ out1 = __msa_srari_h((v8i16)(val1), (shift_right_val)); \
+ out2 = __msa_srari_h((v8i16)(val2), (shift_right_val)); \
+ out3 = __msa_srari_h((v8i16)(val3), (shift_right_val)); \
+}
+
+#define SRARI_W_4VECS_SW(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ shift_right_val) { \
+ out0 = __msa_srari_w((v4i32)(val0), (shift_right_val)); \
+ out1 = __msa_srari_w((v4i32)(val1), (shift_right_val)); \
+ out2 = __msa_srari_w((v4i32)(val2), (shift_right_val)); \
+ out3 = __msa_srari_w((v4i32)(val3), (shift_right_val)); \
+}
+
+#define SRARI_SATURATE_UNSIGNED_H(input, right_shift_val, sat_val) ({ \
+ v8u16 out_m; \
+ \
+ out_m = (v8u16)__msa_srari_h((v8i16)(input), (right_shift_val)); \
+ out_m = __msa_sat_u_h(out_m, (sat_val)); \
+ out_m; \
+})
+
+#define SRARI_SATURATE_SIGNED_H(input, right_shift_val, sat_val) ({ \
+ v8i16 out_m; \
+ \
+ out_m = __msa_srari_h((v8i16)(input), (right_shift_val)); \
+ out_m = __msa_sat_s_h(out_m, (sat_val)); \
+ out_m; \
+})
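
The two statement-expression macros above pair a rounding arithmetic right shift with a saturate. A scalar sketch of the same per-lane pattern, assuming the usual MSA semantics where __msa_sat_u_h with immediate n clamps to the (n+1)-bit unsigned maximum (e.g. 7 -> 255); names here are illustrative, not libvpx API:

#include <stdint.h>

/* Scalar model of SRARI_SATURATE_UNSIGNED_H on one lane: shift right
 * with round-to-nearest, reinterpret the lane as unsigned, then clamp
 * to the (sat_bits + 1)-bit unsigned maximum. */
static uint16_t srari_sat_u(int16_t v, int shift, int sat_bits) {
  int32_t r = ((int32_t)v + (1 << (shift - 1))) >> shift;   /* srari_h */
  uint16_t u = (uint16_t)r;            /* lane reinterpreted unsigned */
  uint16_t max = (uint16_t)((1 << (sat_bits + 1)) - 1);     /* sat_u_h */
  return u > max ? max : u;
}
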
+
+#define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \
+ pdst, stride) { \
+ uint32_t out0_m, out1_m, out2_m, out3_m; \
+ v16i8 tmp0_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)tmp0_m, 0); \
+ out1_m = __msa_copy_u_w((v4i32)tmp0_m, 1); \
+ out2_m = __msa_copy_u_w((v4i32)tmp0_m, 2); \
+ out3_m = __msa_copy_u_w((v4i32)tmp0_m, 3); \
+ \
+ STORE_WORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_WORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_WORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_WORD(dst_m, out3_m); \
+}
+
+#define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, \
+ in3, in4, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \
+ tmp1_m = (v16i8)__msa_xori_b((v16u8)tmp1_m, 128); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+/* Only for signed vecs */
+#define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ tmp_m = (v16i8)__msa_xori_b((v16u8)tmp_m, 128); \
+ STORE_SB(tmp_m, (pdest)); \
+}
+
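
A note on the xori-with-128 step used throughout these store macros: for 8-bit lanes, x ^ 0x80 equals x + 128 modulo 256, which re-biases signed pixel math in [-128, 127] back to the unsigned [0, 255] range. A standalone scalar check (illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void) {
  int v;
  /* XOR with 0x80 flips the sign bit, i.e. adds 128 modulo 256. */
  for (v = -128; v <= 127; ++v)
    assert((uint8_t)(v ^ 0x80) == (uint8_t)(v + 128));
  return 0;
}
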
+/* Only for signed vecs */
+#define PCKEV_B_4_XORI128_AVG_STORE_8_BYTES_4(in1, dst0, \
+ in2, dst1, \
+ in3, dst2, \
+ in4, dst3, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ tmp2_m = (v16u8)__msa_ilvr_d((v2i64)(dst1), (v2i64)(dst0)); \
+ tmp3_m = (v16u8)__msa_ilvr_d((v2i64)(dst3), (v2i64)(dst2)); \
+ \
+ tmp0_m = __msa_xori_b(tmp0_m, 128); \
+ tmp1_m = __msa_xori_b(tmp1_m, 128); \
+ \
+ tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \
+ tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+/* Only for signed vecs */
+#define PCKEV_B_XORI128_AVG_STORE_VEC(in1, in2, dst, pdest) { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ tmp_m = __msa_xori_b(tmp_m, 128); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \
+ STORE_UB(tmp_m, (pdest)); \
+}
+
+#define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+/* Only for unsigned vecs */
+#define PCKEV_B_STORE_VEC(in1, in2, pdest) { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ STORE_SB(tmp_m, (pdest)); \
+}
+
+#define PCKEV_B_AVG_STORE_8_BYTES_4(in1, dst0, in2, dst1, \
+ in3, dst2, in4, dst3, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ tmp2_m = (v16u8)__msa_pckev_d((v2i64)(dst1), (v2i64)(dst0)); \
+ tmp3_m = (v16u8)__msa_pckev_d((v2i64)(dst3), (v2i64)(dst2)); \
+ \
+ tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \
+ tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+#define PCKEV_B_AVG_STORE_VEC(in1, in2, dst, pdest) { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \
+ STORE_UB(tmp_m, (pdest)); \
+}
+
+/* Generic for Vector types and GP operations */
+#define BUTTERFLY_4(in0, in1, in2, in3, \
+ out0, out1, out2, out3) { \
+ out0 = (in0) + (in3); \
+ out1 = (in1) + (in2); \
+ \
+ out2 = (in1) - (in2); \
+ out3 = (in0) - (in3); \
+}
+
+/* Generic for Vector types and GP operations */
+#define BUTTERFLY_8(in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ out0 = (in0) + (in7); \
+ out1 = (in1) + (in6); \
+ out2 = (in2) + (in5); \
+ out3 = (in3) + (in4); \
+ \
+ out4 = (in3) - (in4); \
+ out5 = (in2) - (in5); \
+ out6 = (in1) - (in6); \
+ out7 = (in0) - (in7); \
+}
+#endif /* HAVE_MSA */
+#endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */
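
BUTTERFLY_4 and BUTTERFLY_8 above are deliberately type-generic (only + and -), so they work on MSA vectors and plain integers alike. A minimal scalar sketch of the 4-wide crossed add/sub pattern used in the transform stages:

#include <stdio.h>

#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \
  out0 = (in0) + (in3);                                           \
  out1 = (in1) + (in2);                                           \
  out2 = (in1) - (in2);                                           \
  out3 = (in0) - (in3);                                           \
}

int main(void) {
  int s0, s1, d0, d1;
  BUTTERFLY_4(1, 2, 3, 4, s0, s1, d0, d1);
  printf("%d %d %d %d\n", s0, s1, d0, d1);  /* 5 5 -1 -3 */
  return 0;
}
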
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c
index 8b3b9dbe0f7..7db210c3ae0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c
@@ -17,15 +17,22 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_systemdependent.h"
-static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) {
- int i;
-
- // Top border row
- vpx_memset(mi, 0, sizeof(*mi) * cm->mi_stride);
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
+}
- // Left border column
- for (i = 1; i < cm->mi_rows + 1; ++i)
- vpx_memset(&mi[i * cm->mi_stride], 0, sizeof(*mi));
+void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
}
void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) {
@@ -41,73 +48,70 @@ void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) {
cm->MBs = cm->mb_rows * cm->mb_cols;
}
-static void setup_mi(VP9_COMMON *cm) {
- cm->mi = cm->mip + cm->mi_stride + 1;
- cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
-
- vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
- clear_mi_border(cm, cm->prev_mip);
-}
-
-static int alloc_mi(VP9_COMMON *cm, int mi_size) {
+static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
int i;
- for (i = 0; i < 2; ++i) {
- cm->mip_array[i] =
- (MODE_INFO *)vpx_calloc(mi_size, sizeof(MODE_INFO));
- if (cm->mip_array[i] == NULL)
+ for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+ cm->seg_map_array[i] = (uint8_t *)vpx_calloc(seg_map_size, 1);
+ if (cm->seg_map_array[i] == NULL)
return 1;
}
- cm->mi_alloc_size = mi_size;
-
// Init the index.
- cm->mi_idx = 0;
- cm->prev_mi_idx = 1;
+ cm->seg_map_idx = 0;
+ cm->prev_seg_map_idx = 1;
- cm->mip = cm->mip_array[cm->mi_idx];
- cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
+ cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+ if (!cm->frame_parallel_decode)
+ cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
return 0;
}
-static void free_mi(VP9_COMMON *cm) {
+static void free_seg_map(VP9_COMMON *cm) {
int i;
- for (i = 0; i < 2; ++i) {
- vpx_free(cm->mip_array[i]);
- cm->mip_array[i] = NULL;
+ for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+ vpx_free(cm->seg_map_array[i]);
+ cm->seg_map_array[i] = NULL;
}
- cm->mip = NULL;
- cm->prev_mip = NULL;
+ cm->current_frame_seg_map = NULL;
+
+ if (!cm->frame_parallel_decode) {
+ cm->last_frame_seg_map = NULL;
+ }
}
-void vp9_free_ref_frame_buffers(VP9_COMMON *cm) {
+void vp9_free_ref_frame_buffers(BufferPool *pool) {
int i;
for (i = 0; i < FRAME_BUFFERS; ++i) {
- vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
-
- if (cm->frame_bufs[i].ref_count > 0 &&
- cm->frame_bufs[i].raw_frame_buffer.data != NULL) {
- cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer);
- cm->frame_bufs[i].ref_count = 0;
+ if (pool->frame_bufs[i].ref_count > 0 &&
+ pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
+ pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+ pool->frame_bufs[i].ref_count = 0;
}
+ vpx_free(pool->frame_bufs[i].mvs);
+ pool->frame_bufs[i].mvs = NULL;
+ vp9_free_frame_buffer(&pool->frame_bufs[i].buf);
}
+}
+void vp9_free_postproc_buffers(VP9_COMMON *cm) {
+#if CONFIG_VP9_POSTPROC
vp9_free_frame_buffer(&cm->post_proc_buffer);
+ vp9_free_frame_buffer(&cm->post_proc_buffer_int);
+#else
+ (void)cm;
+#endif
}
void vp9_free_context_buffers(VP9_COMMON *cm) {
- free_mi(cm);
-
- vpx_free(cm->last_frame_seg_map);
- cm->last_frame_seg_map = NULL;
-
+ cm->free_mi(cm);
+ free_seg_map(cm);
vpx_free(cm->above_context);
cm->above_context = NULL;
-
vpx_free(cm->above_seg_context);
cm->above_seg_context = NULL;
}
@@ -116,11 +120,13 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
vp9_free_context_buffers(cm);
vp9_set_mb_mi(cm, width, height);
- if (alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows)))
+ if (cm->alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows)))
goto fail;
- cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
- if (!cm->last_frame_seg_map) goto fail;
+ // Create the segmentation map structure and set it to 0.
+ free_seg_map(cm);
+ if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
+ goto fail;
cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
@@ -138,77 +144,27 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
return 1;
}
-static void init_frame_bufs(VP9_COMMON *cm) {
- int i;
-
- cm->new_fb_idx = FRAME_BUFFERS - 1;
- cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
-
- for (i = 0; i < REF_FRAMES; ++i) {
- cm->ref_frame_map[i] = i;
- cm->frame_bufs[i].ref_count = 1;
- }
-}
-
-int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
- int i;
- const int ss_x = cm->subsampling_x;
- const int ss_y = cm->subsampling_y;
-
- vp9_free_ref_frame_buffers(cm);
-
- for (i = 0; i < FRAME_BUFFERS; ++i) {
- cm->frame_bufs[i].ref_count = 0;
- if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
- ss_x, ss_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif
- VP9_ENC_BORDER_IN_PIXELS) < 0)
- goto fail;
- }
-
- init_frame_bufs(cm);
-
-#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
- if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif
- VP9_ENC_BORDER_IN_PIXELS) < 0)
- goto fail;
-#endif
-
- return 0;
-
- fail:
- vp9_free_ref_frame_buffers(cm);
- return 1;
-}
-
void vp9_remove_common(VP9_COMMON *cm) {
- vp9_free_ref_frame_buffers(cm);
vp9_free_context_buffers(cm);
- vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
+
+ vpx_free(cm->fc);
+ cm->fc = NULL;
+ vpx_free(cm->frame_contexts);
+ cm->frame_contexts = NULL;
}
void vp9_init_context_buffers(VP9_COMMON *cm) {
- setup_mi(cm);
- if (cm->last_frame_seg_map)
- vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
+ cm->setup_mi(cm);
+ if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+ memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
}
-void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
+void vp9_swap_current_and_last_seg_map(VP9_COMMON *cm) {
// Swap indices.
- const int tmp = cm->mi_idx;
- cm->mi_idx = cm->prev_mi_idx;
- cm->prev_mi_idx = tmp;
-
- // Current mip will be the prev_mip for the next frame.
- cm->mip = cm->mip_array[cm->mi_idx];
- cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
+ const int tmp = cm->seg_map_idx;
+ cm->seg_map_idx = cm->prev_seg_map_idx;
+ cm->prev_seg_map_idx = tmp;
- // Update the upper left visible macroblock ptrs.
- cm->mi = cm->mip + cm->mi_stride + 1;
- cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+ cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+ cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
}
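
The seg_map_array change above replaces a single allocation with NUM_PING_PONG_BUFFERS index-swapped buffers, so vp9_swap_current_and_last_seg_map flips two indices instead of copying a frame-sized map. A minimal sketch of the ping-pong pattern with simplified types (not the libvpx structs):

#include <stdint.h>

/* Simplified ping-pong holder; libvpx keeps these fields in VP9_COMMON. */
typedef struct {
  uint8_t *seg_map_array[2];
  int seg_map_idx, prev_seg_map_idx;
  uint8_t *current_frame_seg_map, *last_frame_seg_map;
} PingPong;

static void swap_seg_maps(PingPong *pp) {
  const int tmp = pp->seg_map_idx;          /* swap indices, not data */
  pp->seg_map_idx = pp->prev_seg_map_idx;
  pp->prev_seg_map_idx = tmp;
  pp->current_frame_seg_map = pp->seg_map_array[pp->seg_map_idx];
  pp->last_frame_seg_map = pp->seg_map_array[pp->prev_seg_map_idx];
}
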
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h
index c5b893facca..c0e51a6ce64 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h
@@ -12,11 +12,14 @@
#ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_
#define VP9_COMMON_VP9_ALLOCCOMMON_H_
+#define INVALID_IDX -1 // Invalid buffer index.
+
#ifdef __cplusplus
extern "C" {
#endif
struct VP9Common;
+struct BufferPool;
void vp9_remove_common(struct VP9Common *cm);
@@ -24,14 +27,15 @@ int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height);
void vp9_init_context_buffers(struct VP9Common *cm);
void vp9_free_context_buffers(struct VP9Common *cm);
-int vp9_alloc_ref_frame_buffers(struct VP9Common *cm, int width, int height);
-void vp9_free_ref_frame_buffers(struct VP9Common *cm);
+void vp9_free_ref_frame_buffers(struct BufferPool *pool);
+void vp9_free_postproc_buffers(struct VP9Common *cm);
int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height);
void vp9_free_state_buffers(struct VP9Common *cm);
void vp9_set_mb_mi(struct VP9Common *cm, int width, int height);
-void vp9_swap_mi_and_prev_mi(struct VP9Common *cm);
+
+void vp9_swap_current_and_last_seg_map(struct VP9Common *cm);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c
index 7094a0118c0..b2bb1818893 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c
@@ -40,7 +40,7 @@ void vp9_foreach_transformed_block_in_plane(
const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
foreach_transformed_block_visitor visit, void *arg) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const MB_MODE_INFO* mbmi = &xd->mi[0].src_mi->mbmi;
+ const MB_MODE_INFO* mbmi = &xd->mi[0]->mbmi;
// block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
// 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
// transform size varies per plane, look it up in a common way.
@@ -50,39 +50,25 @@ void vp9_foreach_transformed_block_in_plane(
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int step = 1 << (tx_size << 1);
- int i;
+ int i = 0, r, c;
// If mb_to_right_edge is < 0 we are in a situation in which
// the current block size extends into the UMV and we won't
// visit the sub blocks that are wholly within the UMV.
- if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
- int r, c;
-
- int max_blocks_wide = num_4x4_w;
- int max_blocks_high = num_4x4_h;
-
- // xd->mb_to_right_edge is in units of pixels * 8. This converts
- // it to 4x4 block sizes.
- if (xd->mb_to_right_edge < 0)
- max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-
- if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
- i = 0;
- // Unlike the normal case - in here we have to keep track of the
- // row and column of the blocks we use so that we know if we are in
- // the unrestricted motion border.
- for (r = 0; r < num_4x4_h; r += (1 << tx_size)) {
- for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
- if (r < max_blocks_high && c < max_blocks_wide)
- visit(plane, i, plane_bsize, tx_size, arg);
- i += step;
- }
+ const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+ xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+ xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
+ for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ if (c < max_blocks_wide)
+ visit(plane, i, plane_bsize, tx_size, arg);
+ i += step;
}
- } else {
- for (i = 0; i < num_4x4_w * num_4x4_h; i += step)
- visit(plane, i, plane_bsize, tx_size, arg);
}
}
@@ -117,7 +103,7 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
for (i = above_contexts; i < tx_size_in_blocks; ++i)
a[i] = 0;
} else {
- vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
}
// left
@@ -134,7 +120,7 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
for (i = left_contexts; i < tx_size_in_blocks; ++i)
l[i] = 0;
} else {
- vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
}
}
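
On the unit conversion in the refactored loop bounds above: mb_to_right_edge is in eighth-pel units, so >> 5 divides by 32 = 8 (subpel steps per pixel) * 4 (pixels per 4x4 block), and the extra subsampling shift halves it again for chroma. A worked example with assumed values:

#include <stdio.h>

int main(void) {
  const int mb_to_right_edge = -64;  /* 8 pixels past the edge, in 1/8 pel */
  const int subsampling_x = 1;       /* 4:2:0 chroma plane */
  const int num_4x4_w = 2;
  const int max_blocks_wide = num_4x4_w +
      (mb_to_right_edge >= 0 ? 0 : mb_to_right_edge >> (5 + subsampling_x));
  printf("%d\n", max_blocks_wide);   /* 2 + (-64 >> 6) = 1 */
  return 0;
}
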
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h
index 1234d54c7f0..018a9c2b975 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h
@@ -126,10 +126,10 @@ typedef struct {
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
uint8_t mode_context[MAX_REF_FRAMES];
INTERP_FILTER interp_filter;
+
} MB_MODE_INFO;
typedef struct MODE_INFO {
- struct MODE_INFO *src_mi;
MB_MODE_INFO mbmi;
b_mode_info bmi[4];
} MODE_INFO;
@@ -190,7 +190,11 @@ typedef struct macroblockd {
int mi_stride;
- MODE_INFO *mi;
+ MODE_INFO **mi;
+ MODE_INFO *left_mi;
+ MODE_INFO *above_mi;
+ MB_MODE_INFO *left_mbmi;
+ MB_MODE_INFO *above_mbmi;
int up_available;
int left_available;
@@ -207,6 +211,12 @@ typedef struct macroblockd {
/* pointer to current frame */
const YV12_BUFFER_CONFIG *cur_buf;
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[8];
+
/* mc buffer */
DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
@@ -216,17 +226,13 @@ typedef struct macroblockd {
DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
#endif
- int lossless;
+ /* dqcoeff is shared by all the planes, so planes must be decoded serially */
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
+ int lossless;
int corrupted;
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
-
- ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
- ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT left_seg_context[8];
+ struct vpx_internal_error_info *error_info;
} MACROBLOCKD;
static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
@@ -238,16 +244,17 @@ extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
- const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
- if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi))
+ if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
return DCT_DCT;
+
return intra_mode_to_tx_type_lookup[mbmi->mode];
}
static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MACROBLOCKD *xd, int ib) {
- const MODE_INFO *const mi = xd->mi[0].src_mi;
+ const MODE_INFO *const mi = xd->mi[0];
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
return DCT_DCT;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h
index 6801dd3a2b6..d06b8e0405e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h
@@ -36,17 +36,17 @@ extern "C" {
// Only need this for fixed-size arrays, for structs just assign.
#define vp9_copy(dest, src) { \
assert(sizeof(dest) == sizeof(src)); \
- vpx_memcpy(dest, src, sizeof(src)); \
+ memcpy(dest, src, sizeof(src)); \
}
// Use this for variably-sized arrays.
#define vp9_copy_array(dest, src, n) { \
assert(sizeof(*dest) == sizeof(*src)); \
- vpx_memcpy(dest, src, n * sizeof(*src)); \
+ memcpy(dest, src, n * sizeof(*src)); \
}
-#define vp9_zero(dest) vpx_memset(&(dest), 0, sizeof(dest))
-#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest))
+#define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))
+#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
static INLINE uint8_t clip_pixel(int val) {
return (val > 255) ? 255 : (val < 0) ? 0 : val;
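
The vp9_copy/vp9_zero macros above now map straight onto memcpy/memset; note the sizeof-based guard only works for true arrays, not pointers. A small usage sketch (standalone, with the two macros repeated from the diff):

#include <assert.h>
#include <string.h>

#define vp9_copy(dest, src) {            \
  assert(sizeof(dest) == sizeof(src));   \
  memcpy(dest, src, sizeof(src));        \
}
#define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))

int main(void) {
  int a[4] = {1, 2, 3, 4}, b[4];
  vp9_copy(b, a);  /* arrays: sizeof gives the full 16 bytes */
  vp9_zero(a);     /* zeroes the whole array, not just a pointer */
  return b[3] - 4; /* 0 */
}
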
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c
index 7b65651ba88..90e337fd66e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c
@@ -236,7 +236,7 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Fixed size intermediate buffer places limits on parameters. */
- DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 64 * 64);
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
@@ -256,7 +256,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
(void)filter_y; (void)filter_y_stride;
for (r = h; r > 0; --r) {
- vpx_memcpy(dst, src, w);
+ memcpy(dst, src, w);
src += src_stride;
dst += dst_stride;
}
@@ -501,7 +501,7 @@ void vp9_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
// Fixed size intermediate buffer places limits on parameters.
- DECLARE_ALIGNED_ARRAY(16, uint16_t, temp, 64 * 64);
+ DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
@@ -526,7 +526,7 @@ void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
(void)bd;
for (r = h; r > 0; --r) {
- vpx_memcpy(dst, src, w * sizeof(uint16_t));
+ memcpy(dst, src, w * sizeof(uint16_t));
src += src_stride;
dst += dst_stride;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c
index d9dace6ac8a..3d80103d21b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c
@@ -25,55 +25,65 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) {
static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor,
size_t member_offset) {
int mi_row, mi_col;
- int mi_index = 0;
- // TODO(hkuang): Fix this debug function.
- MODE_INFO **mi = &cm->mi;
+ MODE_INFO **mi = cm->mi_grid_visible;
int rows = cm->mi_rows;
int cols = cm->mi_cols;
char prefix = descriptor[0];
log_frame_info(cm, descriptor, file);
- mi_index = 0;
for (mi_row = 0; mi_row < rows; mi_row++) {
fprintf(file, "%c ", prefix);
for (mi_col = 0; mi_col < cols; mi_col++) {
fprintf(file, "%2d ",
- *((int*) ((char *) (&mi[mi_index]->mbmi) +
- member_offset)));
- mi_index++;
+ *((int*) ((char *) (&mi[0]->mbmi) +
+ member_offset)));
+ mi++;
}
fprintf(file, "\n");
- mi_index += 8;
+ mi += 8;
}
fprintf(file, "\n");
}
+
void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) {
int mi_row;
int mi_col;
- int mi_index = 0;
FILE *mvs = fopen(file, "a");
- // TODO(hkuang): Fix this debug function.
- MODE_INFO **mi = &cm->mi;
+ MODE_INFO **mi = cm->mi_grid_visible;
int rows = cm->mi_rows;
int cols = cm->mi_cols;
print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
- print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip));
print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+ // output skip information.
+ log_frame_info(cm, "Skips:", mvs);
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(mvs, "S ");
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ fprintf(mvs, "%2d ", mi[0]->mbmi.skip);
+ mi++;
+ }
+ fprintf(mvs, "\n");
+ mi += 8;
+ }
+ fprintf(mvs, "\n");
+
+ // output motion vectors.
log_frame_info(cm, "Vectors ", mvs);
+ mi = cm->mi_grid_visible;
for (mi_row = 0; mi_row < rows; mi_row++) {
fprintf(mvs, "V ");
for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%4d:%4d ", mi[mi_index]->mbmi.mv[0].as_mv.row,
- mi[mi_index]->mbmi.mv[0].as_mv.col);
- mi_index++;
+ fprintf(mvs, "%4d:%4d ", mi[0]->mbmi.mv[0].as_mv.row,
+ mi[0]->mbmi.mv[0].as_mv.col);
+ mi++;
}
fprintf(mvs, "\n");
- mi_index += 8;
+ mi += 8;
}
fprintf(mvs, "\n");
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c
index c3fdeb48a6c..a2584e8da5b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c
@@ -15,6 +15,18 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx/vpx_integer.h"
+// Unconstrained Node Tree
+const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+ 2, 6, // 0 = LOW_VAL
+ -TWO_TOKEN, 4, // 1 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
+ 8, 10, // 3 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
+ 12, 14, // 5 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
+};
+
const vp9_prob vp9_cat1_prob[] = { 159 };
const vp9_prob vp9_cat2_prob[] = { 165, 145 };
const vp9_prob vp9_cat3_prob[] = { 173, 148, 140 };
@@ -737,21 +749,21 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
};
static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) {
- vpx_memcpy(probs, vp9_pareto8_full[p = 0 ? 0 : p - 1],
- MODEL_NODES * sizeof(vp9_prob));
+ memcpy(probs, vp9_pareto8_full[p == 0 ? 0 : p - 1],
+ MODEL_NODES * sizeof(vp9_prob));
}
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
if (full != model)
- vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+ memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
}
void vp9_default_coef_probs(VP9_COMMON *cm) {
- vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
- vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
- vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
- vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
+ vp9_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
+ vp9_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
+ vp9_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
+ vp9_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32);
}
#define COEF_COUNT_SAT 24
@@ -765,7 +777,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
unsigned int count_sat,
unsigned int update_factor) {
const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
- vp9_coeff_probs_model *const probs = cm->fc.coef_probs[tx_size];
+ vp9_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size];
const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size];
vp9_coeff_count_model *counts = cm->counts.coef[tx_size];
unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
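
On the new vp9_coef_con_tree table above: vp9_tree_index arrays encode a binary tree in which a positive entry is the offset of the next node pair and a non-positive entry is a leaf storing the negated token. A hedged sketch of the usual walk (the bit reader is abstracted here; this is not the literal libvpx decoder loop):

typedef signed char my_tree_index;  /* stand-in for vp9_tree_index */

/* Walk the tree one decoded bit at a time until a leaf is reached.
 * read_bit/ctx are assumed; leaves hold negated token values. */
static int read_token(const my_tree_index *tree,
                      int (*read_bit)(void *ctx), void *ctx) {
  my_tree_index i = 0;
  while ((i = tree[i + read_bit(ctx)]) > 0)
    continue;
  return -i;
}
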
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h
index 239c0494c63..5a9007b5417 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h
@@ -81,6 +81,7 @@ typedef struct {
const vp9_prob *prob;
int len;
int base_val;
+ const int16_t *cost;
} vp9_extra_bit;
// indexed by token value
@@ -141,10 +142,10 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
for (i = 0; i < MAX_MB_PLANE; i++) {
struct macroblockd_plane *const pd = &xd->plane[i];
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
- vpx_memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) *
- num_4x4_blocks_wide_lookup[plane_bsize]);
- vpx_memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) *
- num_4x4_blocks_high_lookup[plane_bsize]);
+ memset(pd->above_context, 0,
+ sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]);
+ memset(pd->left_context, 0,
+ sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]);
}
}
@@ -172,6 +173,7 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
#define PIVOT_NODE 2 // which node is pivot
#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
+extern const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
@@ -214,7 +216,7 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
PLANE_TYPE type, int block_idx) {
- const MODE_INFO *const mi = xd->mi[0].src_mi;
+ const MODE_INFO *const mi = xd->mi[0];
if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
return &vp9_default_scan_orders[tx_size];
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c
index 5b00b0082a1..424451fee39 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c
@@ -334,60 +334,48 @@ const vp9_tree_index vp9_switchable_interp_tree
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
-#define COUNT_SAT 20
-#define MAX_UPDATE_FACTOR 128
-
-static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
- return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
-}
-
-static void adapt_probs(const vp9_tree_index *tree,
- const vp9_prob *pre_probs, const unsigned int *counts,
- vp9_prob *probs) {
- vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR,
- probs);
-}
-
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
int i, j;
- FRAME_CONTEXT *fc = &cm->fc;
+ FRAME_CONTEXT *fc = cm->fc;
const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
const FRAME_COUNTS *counts = &cm->counts;
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i],
- counts->intra_inter[i]);
+ fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i],
+ counts->intra_inter[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i],
- counts->comp_inter[i]);
+ fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
+ counts->comp_inter[i]);
for (i = 0; i < REF_CONTEXTS; i++)
- fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i],
- counts->comp_ref[i]);
+ fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i],
+ counts->comp_ref[i]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < 2; j++)
- fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j],
- counts->single_ref[i][j]);
+ fc->single_ref_prob[i][j] = mode_mv_merge_probs(
+ pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
- adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+ vp9_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
counts->inter_mode[i], fc->inter_mode_probs[i]);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
- adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+ vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
counts->y_mode[i], fc->y_mode_prob[i]);
for (i = 0; i < INTRA_MODES; ++i)
- adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
- counts->uv_mode[i], fc->uv_mode_prob[i]);
+ vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+ counts->uv_mode[i], fc->uv_mode_prob[i]);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
- counts->partition[i], fc->partition_prob[i]);
+ vp9_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], fc->partition_prob[i]);
if (cm->interp_filter == SWITCHABLE) {
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
- adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
- counts->switchable_interp[i], fc->switchable_interp_prob[i]);
+ vp9_tree_merge_probs(vp9_switchable_interp_tree,
+ pre_fc->switchable_interp_prob[i],
+ counts->switchable_interp[i],
+ fc->switchable_interp_prob[i]);
}
if (cm->tx_mode == TX_MODE_SELECT) {
@@ -399,23 +387,24 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
for (j = 0; j < TX_SIZES - 3; ++j)
- fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
- branch_ct_8x8p[j]);
+ fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs(
+ pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
- fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
- branch_ct_16x16p[j]);
+ fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs(
+ pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
- fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
- branch_ct_32x32p[j]);
+ fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs(
+ pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]);
}
}
for (i = 0; i < SKIP_CONTEXTS; ++i)
- fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]);
+ fc->skip_probs[i] = mode_mv_merge_probs(
+ pre_fc->skip_probs[i], counts->skip[i]);
}
static void set_default_lf_deltas(struct loopfilter *lf) {
@@ -439,8 +428,12 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
int i;
vp9_clearall_segfeatures(&cm->seg);
cm->seg.abs_delta = SEGMENT_DELTADATA;
- if (cm->last_frame_seg_map)
- vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+ if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+ memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+ if (cm->current_frame_seg_map)
+ memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
// Reset the mode ref deltas for loop filter
vp9_zero(lf->last_ref_deltas);
@@ -451,24 +444,24 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
lf->last_sharpness_level = -1;
vp9_default_coef_probs(cm);
- vp9_init_mode_probs(&cm->fc);
+ vp9_init_mode_probs(cm->fc);
vp9_init_mv_probs(cm);
+ cm->fc->initialized = 1;
if (cm->frame_type == KEY_FRAME ||
cm->error_resilient_mode || cm->reset_frame_context == 3) {
// Reset all frame contexts.
for (i = 0; i < FRAME_CONTEXTS; ++i)
- cm->frame_contexts[i] = cm->fc;
+ cm->frame_contexts[i] = *cm->fc;
} else if (cm->reset_frame_context == 2) {
// Reset only the frame context specified in the frame header.
- cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
}
- if (frame_is_intra_only(cm))
- vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) *
- sizeof(*cm->prev_mip));
-
- vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ // prev_mip will only be allocated in the encoder.
+ if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode)
+ memset(cm->prev_mip, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
vp9_zero(cm->ref_frame_sign_bias);
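
The adapt_prob/adapt_probs wrappers removed above were thin aliases for merge_probs with COUNT_SAT 20 and MAX_UPDATE_FACTOR 128; the diff switches to the shared mode_mv_merge_probs/vp9_tree_merge_probs helpers with the same constants. A hedged standalone sketch of the merge step (helper shapes assumed from libvpx conventions, not copied verbatim):

typedef unsigned char vp9_prob;

#define MODE_MV_COUNT_SAT 20
#define MODE_MV_MAX_UPDATE_FACTOR 128

/* Probability observed from the counts, rounded, clipped to [1, 255]. */
static vp9_prob get_prob(unsigned int num, unsigned int den) {
  unsigned int p;
  if (den == 0) return 128;
  p = (num * 256 + (den >> 1)) / den;
  return (vp9_prob)(p < 1 ? 1 : p > 255 ? 255 : p);
}

/* Blend the previous probability toward the observed one; the weight
 * saturates once ct[0] + ct[1] reaches MODE_MV_COUNT_SAT. */
static vp9_prob merge(vp9_prob pre, const unsigned int ct[2]) {
  const unsigned int total = ct[0] + ct[1];
  const unsigned int count =
      total < MODE_MV_COUNT_SAT ? total : MODE_MV_COUNT_SAT;
  const unsigned int factor =
      MODE_MV_MAX_UPDATE_FACTOR * count / MODE_MV_COUNT_SAT;
  const vp9_prob obs = get_prob(ct[0], total);
  return (vp9_prob)((pre * (256 - factor) + obs * factor + 128) >> 8);
}
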
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h
index 6831d3f8738..f4e20e1af8b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h
@@ -33,6 +33,7 @@ struct tx_counts {
unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
+ unsigned int tx_totals[TX_SIZES];
};
typedef struct frame_contexts {
@@ -50,9 +51,10 @@ typedef struct frame_contexts {
struct tx_probs tx_probs;
vp9_prob skip_probs[SKIP_CONTEXTS];
nmv_context nmvc;
+ int initialized;
} FRAME_CONTEXT;
-typedef struct {
+typedef struct FRAME_COUNTS {
unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c
index 5bb048202b7..2477e6ef326 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c
@@ -11,9 +11,6 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_entropymv.h"
-#define MV_COUNT_SAT 20
-#define MV_MAX_UPDATE_FACTOR 128
-
// Integer pel reference mv threshold for use of high-precision 1/8 mv
#define COMPANDED_MVREF_THRESH 8
@@ -183,51 +180,43 @@ void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
}
}
-static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
- return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
-}
-
-static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
- const unsigned int *counts, vp9_prob *probs) {
- vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT,
- MV_MAX_UPDATE_FACTOR, probs);
-}
-
void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
int i, j;
- nmv_context *fc = &cm->fc.nmvc;
+ nmv_context *fc = &cm->fc->nmvc;
const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
const nmv_context_counts *counts = &cm->counts.mv;
- adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints);
+ vp9_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
+ fc->joints);
for (i = 0; i < 2; ++i) {
nmv_component *comp = &fc->comps[i];
const nmv_component *pre_comp = &pre_fc->comps[i];
const nmv_component_counts *c = &counts->comps[i];
- comp->sign = adapt_prob(pre_comp->sign, c->sign);
- adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
- comp->classes);
- adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0);
+ comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
+ vp9_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+ comp->classes);
+ vp9_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0,
+ comp->class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]);
+ comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
for (j = 0; j < CLASS0_SIZE; ++j)
- adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j],
- comp->class0_fp[j]);
+ vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j],
+ c->class0_fp[j], comp->class0_fp[j]);
- adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
+ vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
if (allow_hp) {
- comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp);
- comp->hp = adapt_prob(pre_comp->hp, c->hp);
+ comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
+ comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp);
}
}
}
void vp9_init_mv_probs(VP9_COMMON *cm) {
- cm->fc.nmvc = default_nmv_context;
+ cm->fc->nmvc = default_nmv_context;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h
index f83d21fe3a3..7938fc10a11 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h
@@ -99,17 +99,6 @@ typedef enum {
} TX_TYPE;
typedef enum {
- UNKNOWN = 0,
- BT_601 = 1, // YUV
- BT_709 = 2, // YUV
- SMPTE_170 = 3, // YUV
- SMPTE_240 = 4, // YUV
- RESERVED_1 = 5,
- RESERVED_2 = 6,
- SRGB = 7 // RGB
-} COLOR_SPACE;
-
-typedef enum {
VP9_LAST_FLAG = 1 << 0,
VP9_GOLD_FLAG = 1 << 1,
VP9_ALT_FLAG = 1 << 2,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h
index 3377d45fc0e..d963ee23569 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h
@@ -31,17 +31,14 @@ typedef enum {
EIGHTTAP = 0,
EIGHTTAP_SMOOTH = 1,
EIGHTTAP_SHARP = 2,
+ SWITCHABLE_FILTERS = 3, /* Number of switchable filters */
BILINEAR = 3,
+ // The codec can operate in four possible inter prediction filter modes:
+ // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+ SWITCHABLE_FILTER_CONTEXTS = SWITCHABLE_FILTERS + 1,
SWITCHABLE = 4 /* should be the last one */
} INTERP_FILTER;
-// Number of switchable filters
-#define SWITCHABLE_FILTERS 3
-
-// The codec can operate in four possible inter prediction filter mode:
-// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
-#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
-
typedef int16_t InterpKernel[SUBPEL_TAPS];
const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c
index 34795b74ec2..0f41d66985f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c
@@ -64,7 +64,7 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
// This memset is needed to fix a valgrind error from the C loop filter
// due to accessing uninitialized memory in the frame border. It could be
// removed if the border is totally removed.
- vpx_memset(int_fb_list->int_fb[i].data, 0, min_size);
+ memset(int_fb_list->int_fb[i].data, 0, min_size);
int_fb_list->int_fb[i].size = min_size;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
index d5b6f39b3cd..3b214371c47 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
@@ -11,39 +11,9 @@
#include <math.h>
#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
-
-#if CONFIG_EMULATE_HARDWARE
-// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
-// non-normative method to handle overflows. A stream that causes
-// overflows in the inverse transform is considered invalid in VP9,
-// and a hardware implementer is free to choose any reasonable
-// method to handle overflows. However to aid in hardware
-// verification they can use a specific implementation of the
-// WRAPLOW() macro below that is identical to their intended
-// hardware implementation (and also use configure options to trigger
-// the C-implementation of the transform).
-//
-// The particular WRAPLOW implementation below performs strict
-// overflow wrapping to match common hardware implementations.
-// bd of 8 uses trans_low with 16bits, need to remove 16bits
-// bd of 10 uses trans_low with 18bits, need to remove 14bits
-// bd of 12 uses trans_low with 20bits, need to remove 12bits
-// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) (x)
-#endif // CONFIG_EMULATE_HARDWARE
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
- int bd) {
- trans = WRAPLOW(trans, bd);
- return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_systemdependent.h"
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
trans = WRAPLOW(trans, 8);
@@ -276,10 +246,10 @@ void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
static void iadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- tran_high_t x0 = input[0];
- tran_high_t x1 = input[1];
- tran_high_t x2 = input[2];
- tran_high_t x3 = input[3];
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
if (!(x0 | x1 | x2 | x3)) {
output[0] = output[1] = output[2] = output[3] = 0;
@@ -295,24 +265,19 @@ static void iadst4(const tran_low_t *input, tran_low_t *output) {
s6 = sinpi_4_9 * x3;
s7 = x0 - x2 + x3;
- x0 = s0 + s3 + s5;
- x1 = s1 - s4 - s6;
- x2 = sinpi_3_9 * s7;
- x3 = s2;
-
- s0 = x0 + x3;
- s1 = x1 + x3;
- s2 = x2;
- s3 = x0 + x1 - x3;
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
// 1-D transform scaling factor is sqrt(2).
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
- output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
+ output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
+ output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
- output[3] = WRAPLOW(dct_const_round_shift(s3), 8);
+ output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
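
For reference on the WRAPLOW(..., 8) calls in the rewritten iadst4 above: per the comment removed earlier in this file, the CONFIG_EMULATE_HARDWARE build wraps intermediates to 8 + bd bits with sign extension (and is the identity otherwise). A standalone worked example, assuming an arithmetic right shift on the target:

#include <stdint.h>
#include <stdio.h>

/* Emulate-hardware WRAPLOW: keep the low (8 + bd) bits, sign-extended.
 * Relies on arithmetic right shift of negative values (implementation-
 * defined in C, but universal on the targets libvpx supports). */
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - (bd))) >> (24 - (bd)))

int main(void) {
  printf("%d\n", WRAPLOW(32768, 8));   /* 16-bit wrap: -32768 */
  printf("%d\n", WRAPLOW(70000, 10));  /* fits in 18 bits: 70000 */
  return 0;
}
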
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -367,14 +332,14 @@ static void iadst8(const tran_low_t *input, tran_low_t *output) {
}
// stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
+ s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
+ s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+ s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+ s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+ s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+ s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
+ s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
@@ -386,14 +351,14 @@ static void iadst8(const tran_low_t *input, tran_low_t *output) {
x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
// stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ s0 = (int)x0;
+ s1 = (int)x1;
+ s2 = (int)x2;
+ s3 = (int)x3;
+ s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+ s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+ s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+ s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
x0 = WRAPLOW(s0 + s2, 8);
x1 = WRAPLOW(s1 + s3, 8);
@@ -405,10 +370,10 @@ static void iadst8(const tran_low_t *input, tran_low_t *output) {
x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
// stage 3
- s2 = cospi_16_64 * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (x6 - x7);
+ s2 = (int)(cospi_16_64 * (x2 + x3));
+ s3 = (int)(cospi_16_64 * (x2 - x3));
+ s6 = (int)(cospi_16_64 * (x6 + x7));
+ s7 = (int)(cospi_16_64 * (x6 - x7));
x2 = WRAPLOW(dct_const_round_shift(s2), 8);
x3 = WRAPLOW(dct_const_round_shift(s3), 8);
@@ -1311,7 +1276,7 @@ void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
if (zero_coeff[0] | zero_coeff[1])
idct32(input, outptr);
else
- vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
input += 32;
outptr += 32;
}
@@ -1545,19 +1510,19 @@ void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
}
}
-static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
+void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step[4];
tran_high_t temp1, temp2;
(void) bd;
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2
output[0] = WRAPLOW(step[0] + step[3], bd);
@@ -1576,7 +1541,7 @@ void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
// Rows
for (i = 0; i < 4; ++i) {
- highbd_idct4(input, outptr, bd);
+ vp9_highbd_idct4(input, outptr, bd);
input += 4;
outptr += 4;
}
@@ -1585,7 +1550,7 @@ void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
- highbd_idct4(temp_in, temp_out, bd);
+ vp9_highbd_idct4(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
@@ -1597,10 +1562,11 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
int dest_stride, int bd) {
int i;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) {
@@ -1612,7 +1578,7 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
+void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[8], step2[8];
tran_high_t temp1, temp2;
// stage 1
@@ -1622,15 +1588,15 @@ static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
step1[3] = input[6];
temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2 & stage 3 - even half
- highbd_idct4(step1, step1, bd);
+ vp9_highbd_idct4(step1, step1, bd);
// stage 2 - odd half
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
@@ -1642,8 +1608,8 @@ static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
// stage 4
@@ -1667,7 +1633,7 @@ void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
for (i = 0; i < 8; ++i) {
- highbd_idct8(input, outptr, bd);
+ vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
@@ -1676,7 +1642,7 @@ void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
- highbd_idct8(temp_in, temp_out, bd);
+ vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
@@ -1688,9 +1654,10 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i)
@@ -1702,14 +1669,14 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- tran_high_t x0 = input[0];
- tran_high_t x1 = input[1];
- tran_high_t x2 = input[2];
- tran_high_t x3 = input[3];
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
(void) bd;
if (!(x0 | x1 | x2 | x3)) {
- vpx_memset(output, 0, 4 * sizeof(*output));
+ memset(output, 0, 4 * sizeof(*output));
return;
}
@@ -1720,34 +1687,29 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
s4 = sinpi_1_9 * x2;
s5 = sinpi_2_9 * x3;
s6 = sinpi_4_9 * x3;
- s7 = x0 - x2 + x3;
+ s7 = (tran_high_t)(x0 - x2 + x3);
- x0 = s0 + s3 + s5;
- x1 = s1 - s4 - s6;
- x2 = sinpi_3_9 * s7;
- x3 = s2;
-
- s0 = x0 + x3;
- s1 = x1 + x3;
- s2 = x2;
- s3 = x0 + x1 - x3;
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
// 1-D transform scaling factor is sqrt(2).
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0), bd);
- output[1] = WRAPLOW(dct_const_round_shift(s1), bd);
- output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
- output[3] = WRAPLOW(dct_const_round_shift(s3), bd);
+ output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+ output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+ output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
}
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
const highbd_transform_2d IHT_4[] = {
- { highbd_idct4, highbd_idct4 }, // DCT_DCT = 0
- { highbd_iadst4, highbd_idct4 }, // ADST_DCT = 1
- { highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
+ { vp9_highbd_idct4, vp9_highbd_idct4 }, // DCT_DCT = 0
+ { highbd_iadst4, vp9_highbd_idct4 }, // ADST_DCT = 1
+ { vp9_highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
{ highbd_iadst4, highbd_iadst4 } // ADST_ADST = 3
};
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1779,18 +1741,18 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- tran_high_t x0 = input[7];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[5];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[3];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[1];
- tran_high_t x7 = input[6];
+ tran_low_t x0 = input[7];
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[5];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[3];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[1];
+ tran_low_t x7 = input[6];
(void) bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- vpx_memset(output, 0, 8 * sizeof(*output));
+ memset(output, 0, 8 * sizeof(*output));
return;
}
@@ -1804,14 +1766,14 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd);
- x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd);
- x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd);
- x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd);
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
// stage 2
s0 = x0;
@@ -1827,10 +1789,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
x1 = WRAPLOW(s1 + s3, bd);
x2 = WRAPLOW(s0 - s2, bd);
x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
// stage 3
s2 = cospi_16_64 * (x2 + x3);
@@ -1838,10 +1800,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (x6 - x7);
- x2 = WRAPLOW(dct_const_round_shift(s2), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
output[0] = WRAPLOW(x0, bd);
output[1] = WRAPLOW(-x4, bd);
@@ -1854,9 +1816,9 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
}
static const highbd_transform_2d HIGH_IHT_8[] = {
- { highbd_idct8, highbd_idct8 }, // DCT_DCT = 0
- { highbd_iadst8, highbd_idct8 }, // ADST_DCT = 1
- { highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
+ { vp9_highbd_idct8, vp9_highbd_idct8 }, // DCT_DCT = 0
+ { highbd_iadst8, vp9_highbd_idct8 }, // ADST_DCT = 1
+ { vp9_highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
{ highbd_iadst8, highbd_iadst8 } // ADST_ADST = 3
};
@@ -1899,7 +1861,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
// Only the first 4 rows have non-zero coefs.
for (i = 0; i < 4; ++i) {
- highbd_idct8(input, outptr, bd);
+ vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
@@ -1907,7 +1869,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
- highbd_idct8(temp_in, temp_out, bd);
+ vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
@@ -1915,7 +1877,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
+void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[16], step2[16];
tran_high_t temp1, temp2;
(void) bd;
@@ -1950,23 +1912,23 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 3
step1[0] = step2[0];
@@ -1976,12 +1938,12 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[8] = WRAPLOW(step2[8] + step2[9], bd);
step1[9] = WRAPLOW(step2[8] - step2[9], bd);
@@ -1995,12 +1957,12 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
step2[5] = WRAPLOW(step1[4] - step1[5], bd);
step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
@@ -2010,12 +1972,12 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -2027,8 +1989,8 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
step1[8] = WRAPLOW(step2[8] + step2[11], bd);
@@ -2053,12 +2015,12 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -2091,7 +2053,7 @@ void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
for (i = 0; i < 16; ++i) {
- highbd_idct16(input, outptr, bd);
+ vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
@@ -2100,7 +2062,7 @@ void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
- highbd_idct16(temp_in, temp_out, bd);
+ vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2113,27 +2075,27 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
- tran_high_t x0 = input[15];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[13];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[11];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[9];
- tran_high_t x7 = input[6];
- tran_high_t x8 = input[7];
- tran_high_t x9 = input[8];
- tran_high_t x10 = input[5];
- tran_high_t x11 = input[10];
- tran_high_t x12 = input[3];
- tran_high_t x13 = input[12];
- tran_high_t x14 = input[1];
- tran_high_t x15 = input[14];
+ tran_low_t x0 = input[15];
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[13];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[11];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[9];
+ tran_low_t x7 = input[6];
+ tran_low_t x8 = input[7];
+ tran_low_t x9 = input[8];
+ tran_low_t x10 = input[5];
+ tran_low_t x11 = input[10];
+ tran_low_t x12 = input[3];
+ tran_low_t x13 = input[12];
+ tran_low_t x14 = input[1];
+ tran_low_t x15 = input[14];
(void) bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
| x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
- vpx_memset(output, 0, 16 * sizeof(*output));
+ memset(output, 0, 16 * sizeof(*output));
return;
}
@@ -2155,22 +2117,22 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd);
- x8 = WRAPLOW(dct_const_round_shift(s0 - s8), bd);
- x9 = WRAPLOW(dct_const_round_shift(s1 - s9), bd);
- x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd);
- x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd);
- x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd);
- x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd);
- x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd);
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
// stage 2
s0 = x0;
@@ -2198,14 +2160,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
x5 = WRAPLOW(s1 - s5, bd);
x6 = WRAPLOW(s2 - s6, bd);
x7 = WRAPLOW(s3 - s7, bd);
- x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd);
- x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd);
- x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd);
- x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd);
- x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd);
- x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd);
- x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
// stage 3
s0 = x0;
@@ -2229,18 +2191,18 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
x1 = WRAPLOW(s1 + s3, bd);
x2 = WRAPLOW(s0 - s2, bd);
x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
x8 = WRAPLOW(s8 + s10, bd);
x9 = WRAPLOW(s9 + s11, bd);
x10 = WRAPLOW(s8 - s10, bd);
x11 = WRAPLOW(s9 - s11, bd);
- x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd);
- x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd);
- x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
// stage 4
s2 = (- cospi_16_64) * (x2 + x3);
@@ -2252,14 +2214,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
s14 = (- cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
- x2 = WRAPLOW(dct_const_round_shift(s2), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7), bd);
- x10 = WRAPLOW(dct_const_round_shift(s10), bd);
- x11 = WRAPLOW(dct_const_round_shift(s11), bd);
- x14 = WRAPLOW(dct_const_round_shift(s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s15), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
output[0] = WRAPLOW(x0, bd);
output[1] = WRAPLOW(-x8, bd);
@@ -2280,9 +2242,9 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
}
static const highbd_transform_2d HIGH_IHT_16[] = {
- { highbd_idct16, highbd_idct16 }, // DCT_DCT = 0
- { highbd_iadst16, highbd_idct16 }, // ADST_DCT = 1
- { highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
+ { vp9_highbd_idct16, vp9_highbd_idct16 }, // DCT_DCT = 0
+ { highbd_iadst16, vp9_highbd_idct16 }, // ADST_DCT = 1
+ { vp9_highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
{ highbd_iadst16, highbd_iadst16 } // ADST_ADST = 3
};
@@ -2325,7 +2287,7 @@ void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows. Since all non-zero dct coefficients are in the
// upper-left 4x4 area, we only need to calculate the first 4 rows here.
for (i = 0; i < 4; ++i) {
- highbd_idct16(input, outptr, bd);
+ vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
@@ -2334,7 +2296,7 @@ void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j*16 + i];
- highbd_idct16(temp_in, temp_out, bd);
+ vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2346,10 +2308,11 @@ void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i)
@@ -2383,43 +2346,43 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2
step2[0] = step1[0];
@@ -2433,23 +2396,23 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[16] = WRAPLOW(step1[16] + step1[17], bd);
step2[17] = WRAPLOW(step1[16] - step1[17], bd);
@@ -2476,12 +2439,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[8] = WRAPLOW(step2[8] + step2[9], bd);
step1[9] = WRAPLOW(step2[8] - step2[9], bd);
@@ -2496,22 +2459,22 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[31] = step2[31];
temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[19] = step2[19];
step1[20] = step2[20];
temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[23] = step2[23];
step1[24] = step2[24];
step1[27] = step2[27];
@@ -2520,12 +2483,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
step2[5] = WRAPLOW(step1[4] - step1[5], bd);
step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
@@ -2535,12 +2498,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -2570,8 +2533,8 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
step1[8] = WRAPLOW(step2[8] + step2[11], bd);
@@ -2587,20 +2550,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[17] = step2[17];
temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[22] = step2[22];
step1[23] = step2[23];
step1[24] = step2[24];
@@ -2621,12 +2584,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -2672,20 +2635,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[19] = step2[19];
temp1 = (-step2[20] + step2[27]) * cospi_16_64;
temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[21] + step2[26]) * cospi_16_64;
temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[22] + step2[25]) * cospi_16_64;
temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[23] + step2[24]) * cospi_16_64;
temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[28] = step2[28];
step1[29] = step2[29];
step1[30] = step2[30];
@@ -2749,7 +2712,7 @@ void vp9_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
if (zero_coeff[0] | zero_coeff[1])
highbd_idct32(input, outptr, bd);
else
- vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
input += 32;
outptr += 32;
}
@@ -2799,8 +2762,9 @@ void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
int a1;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
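The vp9_idct.c hunks above apply one mechanical substitution throughout the high-bitdepth inverse transforms: every dct_const_round_shift(x) feeding WRAPLOW now goes through the bd-aware highbd_dct_const_round_shift(x, bd) added in the vp9_idct.h diff below. A self-contained sketch of the recurring rotate-and-round pattern; the helper name highbd_butterfly is hypothetical (not part of libvpx), and the typedefs are stand-ins for the libvpx high-bitdepth definitions:

#include <stdint.h>

typedef int32_t tran_low_t;   /* stand-in for the libvpx highbd typedef */
typedef int64_t tran_high_t;  /* stand-in for the libvpx highbd typedef */

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
#define WRAPLOW(x, bd) (x)    /* non-emulation build: pass-through */

static tran_low_t highbd_dct_const_round_shift(tran_high_t input, int bd) {
  (void)bd;  /* the range assertions are elided from this sketch */
  return (tran_low_t)ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

/* Hypothetical helper showing the coefficient-pair rotation that the
 * hunks above repeat at each butterfly stage. */
static void highbd_butterfly(tran_high_t a, tran_high_t b,
                             tran_high_t c1, tran_high_t c2,
                             tran_low_t *lo, tran_low_t *hi, int bd) {
  *lo = WRAPLOW(highbd_dct_const_round_shift(a * c1 - b * c2, bd), bd);
  *hi = WRAPLOW(highbd_dct_const_round_shift(a * c2 + b * c1, bd), bd);
}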
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h
index 12569b9dee9..6e2551dd4bc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h
@@ -29,10 +29,12 @@ extern "C" {
#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
#define pair_set_epi16(a, b) \
- _mm_set_epi16(b, a, b, a, b, a, b, a)
+ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
#define dual_set_epi16(a, b) \
- _mm_set_epi16(b, b, b, b, a, a, a, a)
+ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
+ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
// Constants:
// for (int i = 1; i< 32; ++i)
@@ -78,13 +80,7 @@ static const tran_high_t sinpi_3_9 = 13377;
static const tran_high_t sinpi_4_9 = 15212;
static INLINE tran_low_t check_range(tran_high_t input) {
-#if CONFIG_VP9_HIGHBITDEPTH
- // For valid highbitdepth VP9 streams, intermediate stage coefficients will
- // stay within the ranges:
- // - 8 bit: signed 16 bit integer
- // - 10 bit: signed 18 bit integer
- // - 12 bit: signed 20 bit integer
-#elif CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid VP9 input streams, intermediate stage coefficients should always
// stay within the range of a signed 16 bit integer. Coefficients can go out
// of this range for invalid/corrupt VP9 streams. However, strictly checking
@@ -93,7 +89,7 @@ static INLINE tran_low_t check_range(tran_high_t input) {
// --enable-coefficient-range-checking.
assert(INT16_MIN <= input);
assert(input <= INT16_MAX);
-#endif
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
return (tran_low_t)input;
}
@@ -102,6 +98,32 @@ static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
return check_range(rv);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+ int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+ // stay within the ranges:
+ // - 8 bit: signed 16 bit integer
+ // - 10 bit: signed 18 bit integer
+ // - 12 bit: signed 20 bit integer
+ const int32_t int_max = (1 << (7 + bd)) - 1;
+ const int32_t int_min = -int_max - 1;
+ assert(int_min <= input);
+ assert(input <= int_max);
+ (void) int_min;
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ (void) bd;
+ return (tran_low_t)input;
+}
+
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+ int bd) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return highbd_check_range(rv, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
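For reference, the int_max bound in highbd_check_range above works out to 32767 for bd == 8, 131071 for bd == 10, and 524287 for bd == 12, i.e. the signed 16/18/20-bit ranges named in the comment. A quick standalone sanity check (not part of the patch):

#include <assert.h>

int main(void) {
  assert(((1 << (7 + 8)) - 1) == 32767);    /* bd = 8: signed 16-bit max */
  assert(((1 << (7 + 10)) - 1) == 131071);  /* bd = 10: signed 18-bit max */
  assert(((1 << (7 + 12)) - 1) == 524287);  /* bd = 12: signed 20-bit max */
  return 0;
}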
typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
typedef struct {
@@ -116,6 +138,28 @@ typedef struct {
} highbd_transform_2d;
#endif // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_EMULATE_HARDWARE
+// When CONFIG_EMULATE_HARDWARE is 1, the transform uses a
+// non-normative method to handle overflows. A stream that causes
+// overflows in the inverse transform is considered invalid in VP9,
+// and a hardware implementer is free to choose any reasonable
+// method to handle overflows. However, to aid in hardware
+// verification, they can use a specific implementation of the
+// WRAPLOW() macro below that is identical to their intended
+// hardware implementation (and also use configure options to trigger
+// the C implementation of the transform).
+//
+// The particular WRAPLOW implementation below performs strict
+// overflow wrapping to match common hardware implementations.
+// A bd of 8 uses tran_low with 16 bits, so 16 bits must be removed;
+// a bd of 10 uses tran_low with 18 bits, so 14 bits must be removed;
+// a bd of 12 uses tran_low with 20 bits, so 12 bits must be removed;
+// a bd of x uses tran_low with 8+x bits, so 24-x bits must be removed.
+#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
+#else
+#define WRAPLOW(x, bd) (x)
+#endif // CONFIG_EMULATE_HARDWARE
+
void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -135,6 +179,9 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd);
+void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd);
+void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd);
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -151,6 +198,11 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+ int bd) {
+ trans = WRAPLOW(trans, bd);
+ return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+}
#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
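A worked example of the emulate-hardware WRAPLOW added above, for bd == 8 (where the macro keeps the low 16 bits as a signed value). The sketch below restates the wrap with explicit casts rather than the macro's shift pair, since shifting an overflowing value is only well-defined on common two's-complement compilers; the assertions are plain modular arithmetic:

#include <assert.h>
#include <stdint.h>

/* Portable restatement of WRAPLOW for bd == 8: keep the low 16 bits of
 * the coefficient and sign-extend, as a 16-bit hardware datapath would. */
static int32_t wraplow_bd8(int32_t x) {
  return (int16_t)(uint16_t)x;
}

int main(void) {
  assert(wraplow_bd8(40000) == -25536);   /* 40000 - 65536 */
  assert(wraplow_bd8(-40000) == 25536);   /* -40000 + 65536 */
  assert(wraplow_bd8(1234) == 1234);      /* in-range values pass through */
  return 0;
}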
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c
index aca8d7b33bd..69d393ef469 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c
@@ -34,10 +34,10 @@
//
// A loopfilter should be applied to every other 8x8 horizontally.
static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
- 0xffffffffffffffff, // TX_4X4
- 0xffffffffffffffff, // TX_8x8
- 0x5555555555555555, // TX_16x16
- 0x1111111111111111, // TX_32x32
+ 0xffffffffffffffffULL, // TX_4X4
+ 0xffffffffffffffffULL, // TX_8x8
+ 0x5555555555555555ULL, // TX_16x16
+ 0x1111111111111111ULL, // TX_32x32
};
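The ULL suffixes added throughout these tables make the 64-bit type of each constant explicit instead of relying on integer-literal promotion rules, which avoids compiler warnings (and, with some older toolchains, truncation) where the values are wider than long. For readers of the mask layout: each byte of a 64-bit mask is one row of eight 8x8 blocks in a 64x64 superblock, and bit i of a row selects column i for filtering. A small sketch decoding the patterns above:

#include <assert.h>
#include <stdint.h>

/* Extract one 8-block row from a 64-bit loopfilter mask. */
static unsigned mask_row(uint64_t mask, int row) {
  return (unsigned)((mask >> (row * 8)) & 0xff);
}

int main(void) {
  const uint64_t tx16 = 0x5555555555555555ULL;  /* TX_16X16 */
  const uint64_t tx32 = 0x1111111111111111ULL;  /* TX_32X32 */
  assert(mask_row(tx16, 0) == 0x55);  /* every other 8x8 column */
  assert(mask_row(tx32, 3) == 0x11);  /* every fourth 8x8 column */
  return 0;
}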
// 64 bit masks for above transform size. Each 1 represents a position where
@@ -58,10 +58,10 @@ static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
//
// A loopfilter should be applied to every other 8x8 row vertically.
static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
- 0xffffffffffffffff, // TX_4X4
- 0xffffffffffffffff, // TX_8x8
- 0x00ff00ff00ff00ff, // TX_16x16
- 0x000000ff000000ff, // TX_32x32
+ 0xffffffffffffffffULL, // TX_4X4
+ 0xffffffffffffffffULL, // TX_8x8
+ 0x00ff00ff00ff00ffULL, // TX_16x16
+ 0x000000ff000000ffULL, // TX_32x32
};
// 64 bit masks for prediction sizes (left). Each 1 represents a position
@@ -80,59 +80,59 @@ static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
// 00000000
// 00000000
static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
- 0x0000000000000001, // BLOCK_4X4,
- 0x0000000000000001, // BLOCK_4X8,
- 0x0000000000000001, // BLOCK_8X4,
- 0x0000000000000001, // BLOCK_8X8,
- 0x0000000000000101, // BLOCK_8X16,
- 0x0000000000000001, // BLOCK_16X8,
- 0x0000000000000101, // BLOCK_16X16,
- 0x0000000001010101, // BLOCK_16X32,
- 0x0000000000000101, // BLOCK_32X16,
- 0x0000000001010101, // BLOCK_32X32,
- 0x0101010101010101, // BLOCK_32X64,
- 0x0000000001010101, // BLOCK_64X32,
- 0x0101010101010101, // BLOCK_64X64
+ 0x0000000000000001ULL, // BLOCK_4X4,
+ 0x0000000000000001ULL, // BLOCK_4X8,
+ 0x0000000000000001ULL, // BLOCK_8X4,
+ 0x0000000000000001ULL, // BLOCK_8X8,
+ 0x0000000000000101ULL, // BLOCK_8X16,
+ 0x0000000000000001ULL, // BLOCK_16X8,
+ 0x0000000000000101ULL, // BLOCK_16X16,
+ 0x0000000001010101ULL, // BLOCK_16X32,
+ 0x0000000000000101ULL, // BLOCK_32X16,
+ 0x0000000001010101ULL, // BLOCK_32X32,
+ 0x0101010101010101ULL, // BLOCK_32X64,
+ 0x0000000001010101ULL, // BLOCK_64X32,
+ 0x0101010101010101ULL, // BLOCK_64X64
};
// 64 bit mask to shift and set for each prediction size.
static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
- 0x0000000000000001, // BLOCK_4X4
- 0x0000000000000001, // BLOCK_4X8
- 0x0000000000000001, // BLOCK_8X4
- 0x0000000000000001, // BLOCK_8X8
- 0x0000000000000001, // BLOCK_8X16,
- 0x0000000000000003, // BLOCK_16X8
- 0x0000000000000003, // BLOCK_16X16
- 0x0000000000000003, // BLOCK_16X32,
- 0x000000000000000f, // BLOCK_32X16,
- 0x000000000000000f, // BLOCK_32X32,
- 0x000000000000000f, // BLOCK_32X64,
- 0x00000000000000ff, // BLOCK_64X32,
- 0x00000000000000ff, // BLOCK_64X64
+ 0x0000000000000001ULL, // BLOCK_4X4
+ 0x0000000000000001ULL, // BLOCK_4X8
+ 0x0000000000000001ULL, // BLOCK_8X4
+ 0x0000000000000001ULL, // BLOCK_8X8
+ 0x0000000000000001ULL, // BLOCK_8X16,
+ 0x0000000000000003ULL, // BLOCK_16X8
+ 0x0000000000000003ULL, // BLOCK_16X16
+ 0x0000000000000003ULL, // BLOCK_16X32,
+ 0x000000000000000fULL, // BLOCK_32X16,
+ 0x000000000000000fULL, // BLOCK_32X32,
+ 0x000000000000000fULL, // BLOCK_32X64,
+ 0x00000000000000ffULL, // BLOCK_64X32,
+ 0x00000000000000ffULL, // BLOCK_64X64
};
// 64 bit mask to shift and set for each prediction size. A bit is set for
// each 8x8 block that would be in the left most block of the given block
// size in the 64x64 block.
static const uint64_t size_mask[BLOCK_SIZES] = {
- 0x0000000000000001, // BLOCK_4X4
- 0x0000000000000001, // BLOCK_4X8
- 0x0000000000000001, // BLOCK_8X4
- 0x0000000000000001, // BLOCK_8X8
- 0x0000000000000101, // BLOCK_8X16,
- 0x0000000000000003, // BLOCK_16X8
- 0x0000000000000303, // BLOCK_16X16
- 0x0000000003030303, // BLOCK_16X32,
- 0x0000000000000f0f, // BLOCK_32X16,
- 0x000000000f0f0f0f, // BLOCK_32X32,
- 0x0f0f0f0f0f0f0f0f, // BLOCK_32X64,
- 0x00000000ffffffff, // BLOCK_64X32,
- 0xffffffffffffffff, // BLOCK_64X64
+ 0x0000000000000001ULL, // BLOCK_4X4
+ 0x0000000000000001ULL, // BLOCK_4X8
+ 0x0000000000000001ULL, // BLOCK_8X4
+ 0x0000000000000001ULL, // BLOCK_8X8
+ 0x0000000000000101ULL, // BLOCK_8X16,
+ 0x0000000000000003ULL, // BLOCK_16X8
+ 0x0000000000000303ULL, // BLOCK_16X16
+ 0x0000000003030303ULL, // BLOCK_16X32,
+ 0x0000000000000f0fULL, // BLOCK_32X16,
+ 0x000000000f0f0f0fULL, // BLOCK_32X32,
+ 0x0f0f0f0f0f0f0f0fULL, // BLOCK_32X64,
+ 0x00000000ffffffffULL, // BLOCK_64X32,
+ 0xffffffffffffffffULL, // BLOCK_64X64
};
// These are used for masking the left and above borders.
-static const uint64_t left_border = 0x1111111111111111;
-static const uint64_t above_border = 0x000000ff000000ff;
+static const uint64_t left_border = 0x1111111111111111ULL;
+static const uint64_t above_border = 0x000000ff000000ffULL;
// 16 bit masks for uv transform sizes.
static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
@@ -222,9 +222,9 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
if (block_inside_limit < 1)
block_inside_limit = 1;
- vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
- vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
- SIMD_WIDTH);
+ memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+ memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
}
}
@@ -245,7 +245,7 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
// init hev threshold const vectors
for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
- vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
+ memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
}
void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
@@ -276,7 +276,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
if (!lf->mode_ref_delta_enabled) {
// we could get rid of this if we assume that deltas are set to
// zero when not in use; encoder always uses deltas
- vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
+ memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
} else {
int ref, mode;
const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
@@ -293,7 +293,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
}
}
-static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
+static void filter_selectively_vert_row2(int subsampling_factor,
uint8_t *s, int pitch,
unsigned int mask_16x16_l,
unsigned int mask_8x8_l,
@@ -301,9 +301,9 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
unsigned int mask_4x4_int_l,
const loop_filter_info_n *lfi_n,
const uint8_t *lfl) {
- const int mask_shift = plane_type ? 4 : 8;
- const int mask_cutoff = plane_type ? 0xf : 0xff;
- const int lfl_forward = plane_type ? 4 : 8;
+ const int mask_shift = subsampling_factor ? 4 : 8;
+ const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+ const int lfl_forward = subsampling_factor ? 4 : 8;
unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
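The rename from plane_type to subsampling_factor in this hunk matches what the value actually encodes (callers now pass plane->subsampling_x, as the new vp9_filter_block_plane_ss00 below shows): a subsampled chroma plane packs 4 block columns per mask row and advances the filter-level array by 4, versus 8 for full-resolution luma. A standalone restatement of that bookkeeping, with hypothetical names:

#include <assert.h>

/* Hypothetical sketch of the mask_shift/mask_cutoff logic above:
 * extract the second row's bits from a two-row packed mask. */
static unsigned second_row_mask(unsigned two_row_mask, int subsampling) {
  const int mask_shift = subsampling ? 4 : 8;
  const unsigned mask_cutoff = subsampling ? 0xfu : 0xffu;
  return (two_row_mask >> mask_shift) & mask_cutoff;
}

int main(void) {
  assert(second_row_mask(0xabu, 1) == 0xa);     /* chroma: 4-bit rows */
  assert(second_row_mask(0xabcdu, 0) == 0xab);  /* luma: 8-bit rows */
  return 0;
}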
@@ -393,7 +393,7 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
}
#if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_filter_selectively_vert_row2(PLANE_TYPE plane_type,
+static void highbd_filter_selectively_vert_row2(int subsampling_factor,
uint16_t *s, int pitch,
unsigned int mask_16x16_l,
unsigned int mask_8x8_l,
@@ -401,9 +401,9 @@ static void highbd_filter_selectively_vert_row2(PLANE_TYPE plane_type,
unsigned int mask_4x4_int_l,
const loop_filter_info_n *lfi_n,
const uint8_t *lfl, int bd) {
- const int mask_shift = plane_type ? 4 : 8;
- const int mask_cutoff = plane_type ? 0xf : 0xff;
- const int lfl_forward = plane_type ? 4 : 8;
+ const int mask_shift = subsampling_factor ? 4 : 8;
+ const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+ const int lfl_forward = subsampling_factor ? 4 : 8;
unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
@@ -727,7 +727,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
const int h = num_8x8_blocks_high_lookup[block_size];
int index = shift_y;
for (i = 0; i < h; i++) {
- vpx_memset(&lfm->lfl_y[index], filter_level, w);
+ memset(&lfm->lfl_y[index], filter_level, w);
index += 8;
}
}
@@ -773,7 +773,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
// an 8x8 in that the internal ones can be skipped and don't depend on
// the prediction block size.
if (tx_size_y == TX_4X4)
- *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+ *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
if (tx_size_uv == TX_4X4)
*int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
@@ -801,7 +801,7 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
const int h = num_8x8_blocks_high_lookup[block_size];
int index = shift_y;
for (i = 0; i < h; i++) {
- vpx_memset(&lfm->lfl_y[index], filter_level, w);
+ memset(&lfm->lfl_y[index], filter_level, w);
index += 8;
}
}
@@ -819,19 +819,19 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
left_64x64_txform_mask[tx_size_y]) << shift_y;
if (tx_size_y == TX_4X4)
- *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+ *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
}
// This function sets up the bit masks for the entire 64x64 region represented
// by mi_row, mi_col.
// TODO(JBB): This function only works for yv12.
void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
- MODE_INFO *mi, const int mode_info_stride,
+ MODE_INFO **mi, const int mode_info_stride,
LOOP_FILTER_MASK *lfm) {
int idx_32, idx_16, idx_8;
const loop_filter_info_n *const lfi_n = &cm->lf_info;
- MODE_INFO *mip = mi;
- MODE_INFO *mip2 = mi;
+ MODE_INFO **mip = mi;
+ MODE_INFO **mip2 = mi;
// These are offsets to the next mi in the 64x64 block. It is what gets
// added to the mi ptr as we go through each loop. It helps us to avoid
@@ -859,28 +859,28 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
cm->mi_cols - mi_col : MI_BLOCK_SIZE);
vp9_zero(*lfm);
- assert(mip != NULL);
+ assert(mip[0] != NULL);
// TODO(jimbankoski): Try moving most of the following code into decode
// loop and storing lfm in the mbmi structure so that we don't have to go
// through the recursive loop structure multiple times.
- switch (mip->mbmi.sb_type) {
+ switch (mip[0]->mbmi.sb_type) {
case BLOCK_64X64:
- build_masks(lfi_n, mip , 0, 0, lfm);
+ build_masks(lfi_n, mip[0] , 0, 0, lfm);
break;
case BLOCK_64X32:
- build_masks(lfi_n, mip, 0, 0, lfm);
+ build_masks(lfi_n, mip[0], 0, 0, lfm);
mip2 = mip + mode_info_stride * 4;
if (4 >= max_rows)
break;
- build_masks(lfi_n, mip2, 32, 8, lfm);
+ build_masks(lfi_n, mip2[0], 32, 8, lfm);
break;
case BLOCK_32X64:
- build_masks(lfi_n, mip, 0, 0, lfm);
+ build_masks(lfi_n, mip[0], 0, 0, lfm);
mip2 = mip + 4;
if (4 >= max_cols)
break;
- build_masks(lfi_n, mip2, 4, 2, lfm);
+ build_masks(lfi_n, mip2[0], 4, 2, lfm);
break;
default:
for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
@@ -890,23 +890,23 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
const int mi_32_row_offset = ((idx_32 >> 1) << 2);
if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
continue;
- switch (mip->mbmi.sb_type) {
+ switch (mip[0]->mbmi.sb_type) {
case BLOCK_32X32:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_32X16:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_row_offset + 2 >= max_rows)
continue;
mip2 = mip + mode_info_stride * 2;
- build_masks(lfi_n, mip2, shift_y + 16, shift_uv + 4, lfm);
+ build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
break;
case BLOCK_16X32:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_col_offset + 2 >= max_cols)
continue;
mip2 = mip + 2;
- build_masks(lfi_n, mip2, shift_y + 2, shift_uv + 1, lfm);
+ build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
break;
default:
for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
@@ -920,29 +920,29 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows)
continue;
- switch (mip->mbmi.sb_type) {
+ switch (mip[0]->mbmi.sb_type) {
case BLOCK_16X16:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_16X8:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_row_offset + 1 >= max_rows)
continue;
mip2 = mip + mode_info_stride;
- build_y_mask(lfi_n, mip2, shift_y+8, lfm);
+ build_y_mask(lfi_n, mip2[0], shift_y+8, lfm);
break;
case BLOCK_8X16:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_col_offset +1 >= max_cols)
continue;
mip2 = mip + 1;
- build_y_mask(lfi_n, mip2, shift_y+1, lfm);
+ build_y_mask(lfi_n, mip2[0], shift_y+1, lfm);
break;
default: {
const int shift_y = shift_32_y[idx_32] +
shift_16_y[idx_16] +
shift_8_y[0];
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
mip += offset[0];
for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
const int shift_y = shift_32_y[idx_32] +
@@ -956,7 +956,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
if (mi_8_col_offset >= max_cols ||
mi_8_row_offset >= max_rows)
continue;
- build_y_mask(lfi_n, mip, shift_y, lfm);
+ build_y_mask(lfi_n, mip[0], shift_y, lfm);
}
break;
}
@@ -968,7 +968,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
break;
}
// The largest loopfilter we have is 16x16 so we use the 16x16 mask
- // for 32x32 transforms also also.
+ // for 32x32 transforms also.
lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
@@ -1021,7 +1021,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
    // Each pixel inside the border gets a 1; the multiply copies the border
    // to where we need it.
- const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101;
+ const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101ULL;
const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
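+    // Worked example (illustrative): if columns == 3, (1 << 3) - 1 == 0x07,
+    // so mask_y == 0x0707070707070707 -- each byte (one row of 8x8 blocks)
+    // keeps its low 3 columns. For chroma, ((3 + 1) >> 1) == 2, giving
+    // mask_uv == 0x3 * 0x1111 == 0x3333 -- one low-2-bit nibble per row.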
// Internal edges are not applied on the last column of the image so
@@ -1053,7 +1053,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
// out.
if (mi_col == 0) {
for (i = 0; i < TX_32X32; i++) {
- lfm->left_y[i] &= 0xfefefefefefefefe;
+ lfm->left_y[i] &= 0xfefefefefefefefeULL;
lfm->left_uv[i] &= 0xeeee;
}
}
@@ -1149,10 +1149,10 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-static void filter_block_plane_non420(VP9_COMMON *cm,
- struct macroblockd_plane *plane,
- MODE_INFO *mi_8x8,
- int mi_row, int mi_col) {
+void vp9_filter_block_plane_non420(VP9_COMMON *cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mi_8x8,
+ int mi_row, int mi_col) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
const int row_step = 1 << ss_y;
@@ -1175,7 +1175,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
// Determine the vertical edges that need filtering
for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
- const MODE_INFO *mi = mi_8x8[c].src_mi;
+ const MODE_INFO *mi = mi_8x8[c];
const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type;
const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi);
// left edge of current unit is block/partition edge -> no skip
@@ -1326,248 +1326,203 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
}
}
-void vp9_filter_block_plane(VP9_COMMON *const cm,
- struct macroblockd_plane *const plane,
- int mi_row,
- LOOP_FILTER_MASK *lfm) {
+void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row,
+ LOOP_FILTER_MASK *lfm) {
struct buf_2d *const dst = &plane->dst;
- uint8_t* const dst0 = dst->buf;
- int r, c;
+ uint8_t *const dst0 = dst->buf;
+ int r;
+ uint64_t mask_16x16 = lfm->left_y[TX_16X16];
+ uint64_t mask_8x8 = lfm->left_y[TX_8X8];
+ uint64_t mask_4x4 = lfm->left_y[TX_4X4];
+ uint64_t mask_4x4_int = lfm->int_4x4_y;
+
+ assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
+
+ // Vertical pass: do 2 rows at one time
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+ unsigned int mask_16x16_l = mask_16x16 & 0xffff;
+ unsigned int mask_8x8_l = mask_8x8 & 0xffff;
+ unsigned int mask_4x4_l = mask_4x4 & 0xffff;
+ unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
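+    // Each 64-bit left mask holds one bit per 8x8 block of the 64x64
+    // superblock, 8 bits per row, so the low 16 bits sliced off above cover
+    // the two rows handled in this iteration (layout inferred from the
+    // 16-bit shifts below).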
+
+// Disable filtering on the leftmost column.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ highbd_filter_selectively_vert_row2(
+ plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+ mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_y[r << 3], (int)cm->bit_depth);
+ } else {
+ filter_selectively_vert_row2(
+ plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
+ mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+ }
+#else
+ filter_selectively_vert_row2(
+ plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
+ mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ dst->buf += 16 * dst->stride;
+ mask_16x16 >>= 16;
+ mask_8x8 >>= 16;
+ mask_4x4 >>= 16;
+ mask_4x4_int >>= 16;
+ }
- if (!plane->plane_type) {
- uint64_t mask_16x16 = lfm->left_y[TX_16X16];
- uint64_t mask_8x8 = lfm->left_y[TX_8X8];
- uint64_t mask_4x4 = lfm->left_y[TX_4X4];
- uint64_t mask_4x4_int = lfm->int_4x4_y;
+ // Horizontal pass
+ dst->buf = dst0;
+ mask_16x16 = lfm->above_y[TX_16X16];
+ mask_8x8 = lfm->above_y[TX_8X8];
+ mask_4x4 = lfm->above_y[TX_4X4];
+ mask_4x4_int = lfm->int_4x4_y;
+
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
- // Vertical pass: do 2 rows at one time
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
- unsigned int mask_16x16_l = mask_16x16 & 0xffff;
- unsigned int mask_8x8_l = mask_8x8 & 0xffff;
- unsigned int mask_4x4_l = mask_4x4 & 0xffff;
- unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16 & 0xff;
+ mask_8x8_r = mask_8x8 & 0xff;
+ mask_4x4_r = mask_4x4 & 0xff;
+ }
- // Disable filtering on the leftmost column.
#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- highbd_filter_selectively_vert_row2(plane->plane_type,
- CONVERT_TO_SHORTPTR(dst->buf),
- dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info, &lfm->lfl_y[r << 3],
- (int)cm->bit_depth);
- } else {
- filter_selectively_vert_row2(plane->plane_type,
- dst->buf, dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info,
- &lfm->lfl_y[r << 3]);
- }
+ if (cm->use_highbitdepth) {
+ highbd_filter_selectively_horiz(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3],
+ (int)cm->bit_depth);
+ } else {
+ filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
+ &lfm->lfl_y[r << 3]);
+ }
#else
- filter_selectively_vert_row2(plane->plane_type,
- dst->buf, dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info, &lfm->lfl_y[r << 3]);
+ filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
+ &lfm->lfl_y[r << 3]);
#endif // CONFIG_VP9_HIGHBITDEPTH
- dst->buf += 16 * dst->stride;
- mask_16x16 >>= 16;
- mask_8x8 >>= 16;
- mask_4x4 >>= 16;
- mask_4x4_int >>= 16;
- }
- // Horizontal pass
- dst->buf = dst0;
- mask_16x16 = lfm->above_y[TX_16X16];
- mask_8x8 = lfm->above_y[TX_8X8];
- mask_4x4 = lfm->above_y[TX_4X4];
- mask_4x4_int = lfm->int_4x4_y;
-
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
- unsigned int mask_16x16_r;
- unsigned int mask_8x8_r;
- unsigned int mask_4x4_r;
-
- if (mi_row + r == 0) {
- mask_16x16_r = 0;
- mask_8x8_r = 0;
- mask_4x4_r = 0;
- } else {
- mask_16x16_r = mask_16x16 & 0xff;
- mask_8x8_r = mask_8x8 & 0xff;
- mask_4x4_r = mask_4x4 & 0xff;
+ dst->buf += 8 * dst->stride;
+ mask_16x16 >>= 8;
+ mask_8x8 >>= 8;
+ mask_4x4 >>= 8;
+ mask_4x4_int >>= 8;
+ }
+}
+
+void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row,
+ LOOP_FILTER_MASK *lfm) {
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ int r, c;
+
+ uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
+ uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
+ uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
+ uint16_t mask_4x4_int = lfm->int_4x4_uv;
+
+ assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+
+ // Vertical pass: do 2 rows at one time
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
+ if (plane->plane_type == 1) {
+ for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
+ lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+ lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
}
+ }
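+    // In the copies above, each chroma 8x8 block covers a 16x16 luma area,
+    // so the chroma filter levels are taken from every other row and column
+    // of the luma lfl grid.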
+
+ {
+ unsigned int mask_16x16_l = mask_16x16 & 0xff;
+ unsigned int mask_8x8_l = mask_8x8 & 0xff;
+ unsigned int mask_4x4_l = mask_4x4 & 0xff;
+ unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
+// Disable filtering on the leftmost column.
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
- dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int & 0xff,
- &cm->lf_info,
- &lfm->lfl_y[r << 3],
- (int)cm->bit_depth);
+ highbd_filter_selectively_vert_row2(
+ plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+ mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
} else {
- filter_selectively_horiz(dst->buf, dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int & 0xff,
- &cm->lf_info,
- &lfm->lfl_y[r << 3]);
+ filter_selectively_vert_row2(
+ plane->subsampling_x, dst->buf, dst->stride,
+ mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_uv[r << 1]);
}
#else
- filter_selectively_horiz(dst->buf, dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int & 0xff,
- &cm->lf_info,
- &lfm->lfl_y[r << 3]);
+ filter_selectively_vert_row2(
+ plane->subsampling_x, dst->buf, dst->stride,
+ mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_uv[r << 1]);
#endif // CONFIG_VP9_HIGHBITDEPTH
- dst->buf += 8 * dst->stride;
+ dst->buf += 16 * dst->stride;
mask_16x16 >>= 8;
mask_8x8 >>= 8;
mask_4x4 >>= 8;
mask_4x4_int >>= 8;
}
- } else {
- uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
- uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
- uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
- uint16_t mask_4x4_int = lfm->int_4x4_uv;
-
- // Vertical pass: do 2 rows at one time
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
- if (plane->plane_type == 1) {
- for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
- lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
- lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) +
- (c << 1)];
- }
- }
-
- {
- unsigned int mask_16x16_l = mask_16x16 & 0xff;
- unsigned int mask_8x8_l = mask_8x8 & 0xff;
- unsigned int mask_4x4_l = mask_4x4 & 0xff;
- unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
+ }
- // Disable filtering on the leftmost column.
-#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- highbd_filter_selectively_vert_row2(plane->plane_type,
- CONVERT_TO_SHORTPTR(dst->buf),
- dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info,
- &lfm->lfl_uv[r << 1],
- (int)cm->bit_depth);
- } else {
- filter_selectively_vert_row2(plane->plane_type,
- dst->buf, dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info,
- &lfm->lfl_uv[r << 1]);
- }
-#else
- filter_selectively_vert_row2(plane->plane_type,
- dst->buf, dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info,
- &lfm->lfl_uv[r << 1]);
-#endif // CONFIG_VP9_HIGHBITDEPTH
+ // Horizontal pass
+ dst->buf = dst0;
+ mask_16x16 = lfm->above_uv[TX_16X16];
+ mask_8x8 = lfm->above_uv[TX_8X8];
+ mask_4x4 = lfm->above_uv[TX_4X4];
+ mask_4x4_int = lfm->int_4x4_uv;
+
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+ const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
+ const unsigned int mask_4x4_int_r =
+ skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
- dst->buf += 16 * dst->stride;
- mask_16x16 >>= 8;
- mask_8x8 >>= 8;
- mask_4x4 >>= 8;
- mask_4x4_int >>= 8;
- }
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16 & 0xf;
+ mask_8x8_r = mask_8x8 & 0xf;
+ mask_4x4_r = mask_4x4 & 0xf;
}
- // Horizontal pass
- dst->buf = dst0;
- mask_16x16 = lfm->above_uv[TX_16X16];
- mask_8x8 = lfm->above_uv[TX_8X8];
- mask_4x4 = lfm->above_uv[TX_4X4];
- mask_4x4_int = lfm->int_4x4_uv;
-
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
- const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
- const unsigned int mask_4x4_int_r = skip_border_4x4_r ?
- 0 : (mask_4x4_int & 0xf);
- unsigned int mask_16x16_r;
- unsigned int mask_8x8_r;
- unsigned int mask_4x4_r;
-
- if (mi_row + r == 0) {
- mask_16x16_r = 0;
- mask_8x8_r = 0;
- mask_4x4_r = 0;
- } else {
- mask_16x16_r = mask_16x16 & 0xf;
- mask_8x8_r = mask_8x8 & 0xf;
- mask_4x4_r = mask_4x4 & 0xf;
- }
-
#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
- dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int_r,
- &cm->lf_info,
- &lfm->lfl_uv[r << 1],
- (int)cm->bit_depth);
- } else {
- filter_selectively_horiz(dst->buf, dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int_r,
- &cm->lf_info,
- &lfm->lfl_uv[r << 1]);
- }
-#else
- filter_selectively_horiz(dst->buf, dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int_r,
- &cm->lf_info,
+ if (cm->use_highbitdepth) {
+ highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+ &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+ } else {
+ filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
&lfm->lfl_uv[r << 1]);
+ }
+#else
+ filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+ &lfm->lfl_uv[r << 1]);
#endif // CONFIG_VP9_HIGHBITDEPTH
- dst->buf += 8 * dst->stride;
- mask_16x16 >>= 4;
- mask_8x8 >>= 4;
- mask_4x4 >>= 4;
- mask_4x4_int >>= 4;
- }
+ dst->buf += 8 * dst->stride;
+ mask_16x16 >>= 4;
+ mask_8x8 >>= 4;
+ mask_4x4 >>= 4;
+ mask_4x4_int >>= 4;
}
}
@@ -1576,13 +1531,21 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
- const int use_420 = y_only || (planes[1].subsampling_y == 1 &&
- planes[1].subsampling_x == 1);
+ enum lf_path path;
LOOP_FILTER_MASK lfm;
int mi_row, mi_col;
+ if (y_only)
+ path = LF_PATH_444;
+ else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+ path = LF_PATH_420;
+ else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+ path = LF_PATH_444;
+ else
+ path = LF_PATH_SLOW;
+
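+  // 4:2:0 chroma (subsampling 1,1) takes the ss11 path, 4:4:4 and y_only
+  // filtering take the ss00 path, and any other layout (e.g. 4:2:2) falls
+  // back to the slow non-420 filter.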
for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
- MODE_INFO *mi = cm->mi + mi_row * cm->mi_stride;
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
int plane;
@@ -1590,16 +1553,23 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
// TODO(JBB): Make setup_mask work for non 420.
- if (use_420)
- vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
- &lfm);
-
- for (plane = 0; plane < num_planes; ++plane) {
- if (use_420)
- vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
- else
- filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
- mi_row, mi_col);
+ vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+ &lfm);
+
+ vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+ for (plane = 1; plane < num_planes; ++plane) {
+ switch (path) {
+ case LF_PATH_420:
+ vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_444:
+ vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_SLOW:
+ vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ break;
+ }
}
}
}
@@ -1625,6 +1595,17 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
y_only);
}
+void vp9_loop_filter_data_reset(
+ LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+ struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {
+ lf_data->frame_buffer = frame_buffer;
+ lf_data->cm = cm;
+ lf_data->start = 0;
+ lf_data->stop = 0;
+ lf_data->y_only = 0;
+ memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
+}
+
int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
(void)unused;
vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h
index 0ede58ae481..f7cbde678de 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h
@@ -29,6 +29,12 @@ extern "C" {
#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 2
+enum lf_path {
+ LF_PATH_420,
+ LF_PATH_444,
+ LF_PATH_SLOW,
+};
+
struct loopfilter {
int filter_level;
@@ -89,13 +95,23 @@ struct VP9LfSyncData;
// by mi_row, mi_col.
void vp9_setup_mask(struct VP9Common *const cm,
const int mi_row, const int mi_col,
- MODE_INFO *mi_8x8, const int mode_info_stride,
+ MODE_INFO **mi_8x8, const int mode_info_stride,
LOOP_FILTER_MASK *lfm);
-void vp9_filter_block_plane(struct VP9Common *const cm,
- struct macroblockd_plane *const plane,
- int mi_row,
- LOOP_FILTER_MASK *lfm);
+void vp9_filter_block_plane_ss00(struct VP9Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row,
+ LOOP_FILTER_MASK *lfm);
+
+void vp9_filter_block_plane_ss11(struct VP9Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row,
+ LOOP_FILTER_MASK *lfm);
+
+void vp9_filter_block_plane_non420(struct VP9Common *cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mi_8x8,
+ int mi_row, int mi_col);
void vp9_loop_filter_init(struct VP9Common *cm);
@@ -124,11 +140,12 @@ typedef struct LoopFilterWorkerData {
int start;
int stop;
int y_only;
-
- struct VP9LfSyncData *lf_sync;
- int num_lf_workers;
} LFWorkerData;
+void vp9_loop_filter_data_reset(
+ LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+ struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
+
// Operates on the rows described by 'lf_data'.
int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
#ifdef __cplusplus
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c
new file mode 100644
index 00000000000..57189df16ec
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+
+// TODO(jackychen): Replace this function with SSE2 code. There is
+// one SSE2 implementation in vp8, so we will consider how to share
+// it between vp8 and vp9.
+static void filter_by_weight(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int block_size, int src_weight) {
+ const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ const int rounding_bit = 1 << (MFQE_PRECISION - 1);
+ int r, c;
+
+ for (r = 0; r < block_size; r++) {
+ for (c = 0; c < block_size; c++) {
+ dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
+ >> MFQE_PRECISION;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
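+
+// Worked example (assuming MFQE_PRECISION == 4, which matches the callers
+// below): src_weight == 12 gives dst_weight == 4 and rounding_bit == 8, so
+// each output pixel becomes (src * 12 + dst * 4 + 8) >> 4, i.e. a 3:1 blend
+// in favor of the source block.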
+
+void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int src_weight) {
+ filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int src_weight) {
+ filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+static void filter_by_weight32x32(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int weight) {
+ vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
+ vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
+ weight);
+ vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,
+ dst + dst_stride * 16, dst_stride, weight);
+ vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
+ dst + dst_stride * 16 + 16, dst_stride, weight);
+}
+
+static void filter_by_weight64x64(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int weight) {
+ filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
+ filter_by_weight32x32(src + 32, src_stride, dst + 32,
+ dst_stride, weight);
+ filter_by_weight32x32(src + src_stride * 32, src_stride,
+ dst + dst_stride * 32, dst_stride, weight);
+ filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
+ dst + dst_stride * 32 + 32, dst_stride, weight);
+}
+
+static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
+ int yd_stride, const uint8_t *u, const uint8_t *v,
+ int uv_stride, uint8_t *ud, uint8_t *vd,
+ int uvd_stride, BLOCK_SIZE block_size,
+ int weight) {
+ if (block_size == BLOCK_16X16) {
+ vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
+ vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
+ vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
+ } else if (block_size == BLOCK_32X32) {
+ filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
+ vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
+ vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
+ } else if (block_size == BLOCK_64X64) {
+ filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
+ filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
+ filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
+ }
+}
+
+// TODO(jackychen): Determine whether to replace this with assembly code.
+static void copy_mem8x8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ int r;
+ for (r = 0; r < 8; r++) {
+ memcpy(dst, src, 8);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void copy_mem16x16(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ int r;
+ for (r = 0; r < 16; r++) {
+ memcpy(dst, src, 16);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void copy_mem32x32(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ copy_mem16x16(src, src_stride, dst, dst_stride);
+ copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
+ copy_mem16x16(src + src_stride * 16, src_stride,
+ dst + dst_stride * 16, dst_stride);
+ copy_mem16x16(src + src_stride * 16 + 16, src_stride,
+ dst + dst_stride * 16 + 16, dst_stride);
+}
+
+static void copy_mem64x64(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  copy_mem32x32(src, src_stride, dst, dst_stride);
+  copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
+  copy_mem32x32(src + src_stride * 32, src_stride,
+                dst + dst_stride * 32, dst_stride);
+  copy_mem32x32(src + src_stride * 32 + 32, src_stride,
+                dst + dst_stride * 32 + 32, dst_stride);
+}
+
+static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
+ uint8_t *vd, int yd_stride, int uvd_stride,
+ BLOCK_SIZE bs) {
+ if (bs == BLOCK_16X16) {
+ copy_mem16x16(y, y_stride, yd, yd_stride);
+ copy_mem8x8(u, uv_stride, ud, uvd_stride);
+ copy_mem8x8(v, uv_stride, vd, uvd_stride);
+ } else if (bs == BLOCK_32X32) {
+ copy_mem32x32(y, y_stride, yd, yd_stride);
+ copy_mem16x16(u, uv_stride, ud, uvd_stride);
+ copy_mem16x16(v, uv_stride, vd, uvd_stride);
+ } else {
+ copy_mem64x64(y, y_stride, yd, yd_stride);
+ copy_mem32x32(u, uv_stride, ud, uvd_stride);
+ copy_mem32x32(v, uv_stride, vd, uvd_stride);
+ }
+}
+
+static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
+ const int adj = qdiff >> MFQE_PRECISION;
+ if (bs == BLOCK_16X16) {
+ *sad_thr = 7 + adj;
+ } else if (bs == BLOCK_32X32) {
+ *sad_thr = 6 + adj;
+ } else { // BLOCK_64X64
+ *sad_thr = 5 + adj;
+ }
+ *vdiff_thr = 125 + qdiff;
+}
+
+static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
+ const uint8_t *v, int y_stride, int uv_stride,
+ uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
+ int uvd_stride, int qdiff) {
+ int sad, sad_thr, vdiff, vdiff_thr;
+ uint32_t sse;
+
+ get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
+
+ if (bs == BLOCK_16X16) {
+ vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+ sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+ } else if (bs == BLOCK_32X32) {
+ vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+ sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
+ } else /* if (bs == BLOCK_64X64) */ {
+ vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+ sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
+ }
+
+  // Requiring vdiff > sad * 3 means vdiff should not be too small;
+  // otherwise the difference might just be a lighting change in a smooth
+  // area, and doing MFQE on a lighting change in a smooth area is dangerous.
+ if (sad > 1 && vdiff > sad * 3) {
+ const int weight = 1 << MFQE_PRECISION;
+ int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
+ // When ifactor equals weight, no MFQE is done.
+ if (ifactor > weight) {
+ ifactor = weight;
+ }
+ apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
+ uvd_stride, bs, ifactor);
+ } else {
+    // Copy the block from the current frame (i.e., no MFQE is done).
+ copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride, bs);
+ }
+}
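+
+// Illustrative arithmetic for the ifactor computation above: with weight ==
+// 16 (assuming MFQE_PRECISION == 4), sad == 4, vdiff == 20, sad_thr == 7 and
+// vdiff_thr == 125, ifactor == 16 * 4 * 20 / (7 * 125) == 1, so the output
+// is blended almost entirely from the higher-quality last frame; larger sad
+// or vdiff values push ifactor toward weight, i.e. toward keeping the
+// current frame unchanged.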
+
+static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
+  // Check the motion in the current block (for an inter frame),
+  // or the motion in the correlated block in the last frame (for a keyframe).
+ const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
+ mi->mbmi.mv[0].as_mv.row +
+ mi->mbmi.mv[0].as_mv.col *
+ mi->mbmi.mv[0].as_mv.col;
+ const int mv_threshold = 100;
+ return mi->mbmi.mode >= NEARESTMV && // Not an intra block
+ cur_bs >= BLOCK_16X16 &&
+ mv_len_square <= mv_threshold;
+}
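+
+// Note: VP9 motion vectors are in 1/8-pel units, so mv_threshold == 100
+// accepts squared lengths up to 100, i.e. motion of at most sqrt(100) / 8 ==
+// 1.25 pixels -- effectively stationary blocks.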
+
+// Process each partition in a superblock, recursively.
+static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
+ const uint8_t *y, const uint8_t *u,
+ const uint8_t *v, int y_stride, int uv_stride,
+ uint8_t *yd, uint8_t *ud, uint8_t *vd,
+ int yd_stride, int uvd_stride) {
+ int mi_offset, y_offset, uv_offset;
+ const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
+ const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+ const int bsl = b_width_log2_lookup[bs];
+ PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
+ const BLOCK_SIZE subsize = get_subsize(bs, partition);
+
+ if (cur_bs < BLOCK_8X8) {
+    // If there are blocks smaller than 8x8, they must be on the boundary.
+ return;
+ }
+ // No MFQE on blocks smaller than 16x16
+ if (bs == BLOCK_16X16) {
+ partition = PARTITION_NONE;
+ }
+ if (bs == BLOCK_64X64) {
+ mi_offset = 4;
+ y_offset = 32;
+ uv_offset = 16;
+ } else {
+ mi_offset = 2;
+ y_offset = 16;
+ uv_offset = 8;
+ }
+ switch (partition) {
+ BLOCK_SIZE mfqe_bs, bs_tmp;
+ case PARTITION_HORZ:
+ if (bs == BLOCK_64X64) {
+ mfqe_bs = BLOCK_64X32;
+ bs_tmp = BLOCK_32X32;
+ } else {
+ mfqe_bs = BLOCK_32X16;
+ bs_tmp = BLOCK_16X16;
+ }
+ if (mfqe_decision(mi, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+ y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+ vd + uv_offset, yd_stride, uvd_stride, qdiff);
+ }
+ if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+ v + uv_offset * uv_stride, y_stride, uv_stride,
+ yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+ u + uv_offset * uv_stride + uv_offset,
+ v + uv_offset * uv_stride + uv_offset, y_stride,
+ uv_stride, yd + y_offset * yd_stride + y_offset,
+ ud + uv_offset * uvd_stride + uv_offset,
+ vd + uv_offset * uvd_stride + uv_offset,
+ yd_stride, uvd_stride, qdiff);
+ }
+ break;
+ case PARTITION_VERT:
+ if (bs == BLOCK_64X64) {
+ mfqe_bs = BLOCK_32X64;
+ bs_tmp = BLOCK_32X32;
+ } else {
+ mfqe_bs = BLOCK_16X32;
+ bs_tmp = BLOCK_16X16;
+ }
+ if (mfqe_decision(mi, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+ v + uv_offset * uv_stride, y_stride, uv_stride,
+ yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
+ }
+ if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+ y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+ vd + uv_offset, yd_stride, uvd_stride, qdiff);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+ u + uv_offset * uv_stride + uv_offset,
+ v + uv_offset * uv_stride + uv_offset, y_stride,
+ uv_stride, yd + y_offset * yd_stride + y_offset,
+ ud + uv_offset * uvd_stride + uv_offset,
+ vd + uv_offset * uvd_stride + uv_offset,
+ yd_stride, uvd_stride, qdiff);
+ }
+ break;
+ case PARTITION_NONE:
+ if (mfqe_decision(mi, cur_bs)) {
+ // Do mfqe on this partition.
+ mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
+ } else {
+        // Copy the block from the current frame (i.e., no MFQE is done).
+ copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride, bs);
+ }
+ break;
+ case PARTITION_SPLIT:
+ // Recursion on four square partitions, e.g. if bs is 64X64,
+ // then look into four 32X32 blocks in it.
+ mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
+ v + uv_offset, y_stride, uv_stride, yd + y_offset,
+ ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
+ y + y_offset * y_stride, u + uv_offset * uv_stride,
+ v + uv_offset * uv_stride, y_stride, uv_stride,
+ yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
+ subsize, y + y_offset * y_stride + y_offset,
+ u + uv_offset * uv_stride + uv_offset,
+ v + uv_offset * uv_stride + uv_offset, y_stride,
+ uv_stride, yd + y_offset * yd_stride + y_offset,
+ ud + uv_offset * uvd_stride + uv_offset,
+ vd + uv_offset * uvd_stride + uv_offset,
+ yd_stride, uvd_stride);
+ break;
+ default:
+ assert(0);
+ }
+}
+
+void vp9_mfqe(VP9_COMMON *cm) {
+ int mi_row, mi_col;
+ // Current decoded frame.
+ const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+  // The last decoded frame; it will also store the MFQE result.
+ YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+ // Loop through each super block.
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+ MODE_INFO *mi;
+ MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
+      // Motion info from the last frame.
+ MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
+ (mi_row * cm->mi_stride + mi_col);
+ const uint32_t y_stride = show->y_stride;
+ const uint32_t uv_stride = show->uv_stride;
+ const uint32_t yd_stride = dest->y_stride;
+ const uint32_t uvd_stride = dest->uv_stride;
+ const uint32_t row_offset_y = mi_row << 3;
+ const uint32_t row_offset_uv = mi_row << 2;
+ const uint32_t col_offset_y = mi_col << 3;
+ const uint32_t col_offset_uv = mi_col << 2;
+ const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
+ col_offset_y;
+ const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
+ col_offset_uv;
+ const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
+ col_offset_uv;
+ uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
+ uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
+ col_offset_uv;
+ uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
+ col_offset_uv;
+ if (frame_is_intra_only(cm)) {
+ mi = mi_prev;
+ } else {
+ mi = mi_local;
+ }
+ mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
+ vd, yd_stride, uvd_stride);
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.h
new file mode 100644
index 00000000000..dfff8c23d65
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_MFQE_H_
+#define VP9_COMMON_VP9_MFQE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Multiframe Quality Enhancement.
+// The aim for MFQE is to replace pixel blocks in the current frame with
+// the correlated pixel blocks (with higher quality) in the last frame.
+// The replacement is only applied to stationary blocks, selected by
+// checking the motion of the blocks and other conditions such as the
+// SAD between the current block and the correlated block, the variance
+// of the block difference, etc.
+void vp9_mfqe(struct VP9Common *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_COMMON_VP9_MFQE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c
index 3b34050a840..51e147e0056 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c
@@ -17,19 +17,18 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
- int block, int mi_row, int mi_col) {
+ int block, int mi_row, int mi_col,
+ find_mv_refs_sync sync, void *const data) {
const int *ref_sign_bias = cm->ref_frame_sign_bias;
int i, refmv_count = 0;
- const MODE_INFO *prev_mi = !cm->error_resilient_mode && cm->prev_mi
- ? cm->prev_mi[mi_row * xd->mi_stride + mi_col].src_mi
- : NULL;
- const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->src_mi->mbmi : NULL;
const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
int different_ref_found = 0;
int context_counter = 0;
+ const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ?
+ cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
// Blank the reference vector list
- vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+ memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
// The nearest 2 blocks are treated differently
// if the size < 8x8 we get the mv from the bmi substructure,
@@ -38,16 +37,18 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
- xd->mi_stride].src_mi;
+ xd->mi_stride];
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
// Keep counts for entropy encoding.
context_counter += mode_2_counter[candidate->mode];
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
- ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block));
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+ refmv_count, mv_ref_list, Done);
else if (candidate->ref_frame[1] == ref_frame)
- ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block));
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+ refmv_count, mv_ref_list, Done);
}
}
@@ -58,22 +59,38 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
- xd->mi_stride].src_mi->mbmi;
+ xd->mi_stride]->mbmi;
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
- ADD_MV_REF_LIST(candidate->mv[0]);
+ ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, Done);
else if (candidate->ref_frame[1] == ref_frame)
- ADD_MV_REF_LIST(candidate->mv[1]);
+ ADD_MV_REF_LIST(candidate->mv[1], refmv_count, mv_ref_list, Done);
}
}
+  // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
+  // on the Windows platform. The sync here is unnecessary if
+  // use_prev_frame_mvs is 0. But after removing it, the unit test hangs on
+  // Windows because several threads end up waiting for a thread's signal.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+ if (cm->frame_parallel_decode && sync != NULL) {
+ sync(data, mi_row);
+ }
+#endif
+
// Check the last frame's mode and mv info.
- if (prev_mbmi) {
- if (prev_mbmi->ref_frame[0] == ref_frame)
- ADD_MV_REF_LIST(prev_mbmi->mv[0]);
- else if (prev_mbmi->ref_frame[1] == ref_frame)
- ADD_MV_REF_LIST(prev_mbmi->mv[1]);
+ if (cm->use_prev_frame_mvs) {
+ // Synchronize here for frame parallel decode if sync function is provided.
+ if (cm->frame_parallel_decode && sync != NULL) {
+ sync(data, mi_row);
+ }
+
+ if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+ ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
+ } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+ ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done);
+ }
}
// Since we couldn't find 2 mvs from the same reference frame
@@ -84,17 +101,40 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
- * xd->mi_stride].src_mi->mbmi;
+ * xd->mi_stride]->mbmi;
// If the candidate is INTRA we don't want to consider its mv.
- IF_DIFF_REF_FRAME_ADD_MV(candidate);
+ IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+ refmv_count, mv_ref_list, Done);
}
}
}
// Since we still don't have a candidate we'll try the last frame.
- if (prev_mbmi)
- IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi);
+ if (cm->use_prev_frame_mvs) {
+ if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+ prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+ int_mv mv = prev_frame_mvs->mv[0];
+ if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+ ref_sign_bias[ref_frame]) {
+ mv.as_mv.row *= -1;
+ mv.as_mv.col *= -1;
+ }
+ ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+ }
+
+ if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+ prev_frame_mvs->ref_frame[1] != ref_frame &&
+ prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) {
+ int_mv mv = prev_frame_mvs->mv[1];
+ if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+ ref_sign_bias[ref_frame]) {
+ mv.as_mv.row *= -1;
+ mv.as_mv.col *= -1;
+ }
+ ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+ }
+ }
Done:
@@ -109,9 +149,10 @@ void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
- int mi_row, int mi_col) {
+ int mi_row, int mi_col,
+ find_mv_refs_sync sync, void *const data) {
find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
- mi_row, mi_col);
+ mi_row, mi_col, sync, data);
}
static void lower_mv_precision(MV *mv, int allow_hp) {
@@ -125,43 +166,44 @@ static void lower_mv_precision(MV *mv, int allow_hp) {
}
void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
- int_mv *mvlist, int_mv *nearest, int_mv *near) {
+ int_mv *mvlist, int_mv *nearest_mv,
+ int_mv *near_mv) {
int i;
// Make sure all the candidates are properly clamped etc
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
lower_mv_precision(&mvlist[i].as_mv, allow_hp);
clamp_mv2(&mvlist[i].as_mv, xd);
}
- *nearest = mvlist[0];
- *near = mvlist[1];
+ *nearest_mv = mvlist[0];
+ *near_mv = mvlist[1];
}
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
int block, int ref, int mi_row, int mi_col,
- int_mv *nearest, int_mv *near) {
+ int_mv *nearest_mv, int_mv *near_mv) {
int_mv mv_list[MAX_MV_REF_CANDIDATES];
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ MODE_INFO *const mi = xd->mi[0];
b_mode_info *bmi = mi->bmi;
int n;
assert(MAX_MV_REF_CANDIDATES == 2);
find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block,
- mi_row, mi_col);
+ mi_row, mi_col, NULL, NULL);
- near->as_int = 0;
+ near_mv->as_int = 0;
switch (block) {
case 0:
- nearest->as_int = mv_list[0].as_int;
- near->as_int = mv_list[1].as_int;
+ nearest_mv->as_int = mv_list[0].as_int;
+ near_mv->as_int = mv_list[1].as_int;
break;
case 1:
case 2:
- nearest->as_int = bmi[0].as_mv[ref].as_int;
+ nearest_mv->as_int = bmi[0].as_mv[ref].as_int;
for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
- if (nearest->as_int != mv_list[n].as_int) {
- near->as_int = mv_list[n].as_int;
+ if (nearest_mv->as_int != mv_list[n].as_int) {
+ near_mv->as_int = mv_list[n].as_int;
break;
}
break;
@@ -172,10 +214,10 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
candidates[2] = mv_list[0];
candidates[3] = mv_list[1];
- nearest->as_int = bmi[2].as_mv[ref].as_int;
+ nearest_mv->as_int = bmi[2].as_mv[ref].as_int;
for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n)
- if (nearest->as_int != candidates[n].as_int) {
- near->as_int = candidates[n].as_int;
+ if (nearest_mv->as_int != candidates[n].as_int) {
+ near_mv->as_int = candidates[n].as_int;
break;
}
break;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h
index a937b7823a5..f1df521468f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h
@@ -158,29 +158,32 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
// This macro is used to add a motion vector to the mv_ref list if it isn't
// already in the list. If it's the second motion vector it will also
// skip all additional processing and jump to Done!
-#define ADD_MV_REF_LIST(mv) \
+#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done) \
do { \
if (refmv_count) { \
- if ((mv).as_int != mv_ref_list[0].as_int) { \
- mv_ref_list[refmv_count] = (mv); \
+ if ((mv).as_int != (mv_ref_list)[0].as_int) { \
+ (mv_ref_list)[(refmv_count)] = (mv); \
goto Done; \
} \
} else { \
- mv_ref_list[refmv_count++] = (mv); \
+ (mv_ref_list)[(refmv_count)++] = (mv); \
} \
} while (0)
// If either reference frame is different, not INTRA, and the mvs are
// different from each other, scale and add the mv to our list.
-#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \
+ mv_ref_list, Done) \
do { \
if (is_inter_block(mbmi)) { \
if ((mbmi)->ref_frame[0] != ref_frame) \
- ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
+ ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
+ refmv_count, mv_ref_list, Done); \
if (has_second_ref(mbmi) && \
(mbmi)->ref_frame[1] != ref_frame && \
(mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
- ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
+ ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
+ refmv_count, mv_ref_list, Done); \
} \
} while (0)
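+// Illustrative expansion: with refmv_count == 1 and an inter candidate whose
+// first reference frame differs from ref_frame,
+//   IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+//                            refmv_count, mv_ref_list, Done);
+// stores the scaled mv[0] as mv_ref_list[1] (if it differs from entry 0) and
+// jumps to Done.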
@@ -204,21 +207,23 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
+typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
- int_mv *mv_ref_list, int mi_row, int mi_col);
+ int_mv *mv_ref_list, int mi_row, int mi_col,
+ find_mv_refs_sync sync, void *const data);
// Check a list of motion vectors by SAD score using a number of rows of
// pixels above and a number of cols of pixels to the left to select the one
// with the best score to use as the ref motion vector.
void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
- int_mv *mvlist, int_mv *nearest, int_mv *near);
+ int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv);
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
int block, int ref, int mi_row, int mi_col,
- int_mv *nearest, int_mv *near);
+ int_mv *nearest_mv, int_mv *near_mv);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h
index f1eda911737..5179c690633 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h
@@ -20,6 +20,7 @@
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_frame_buffers.h"
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_thread.h"
#include "vp9/common/vp9_tile_common.h"
#if CONFIG_VP9_POSTPROC
@@ -35,14 +36,19 @@ extern "C" {
#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)
-// 1 scratch frame for the new frame, 3 for scaled references on the encoder
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add on-demand frame buffers instead of hardcoding the number
+// of frame buffers.
// TODO(jkoleszar): These 3 extra references could probably come from the
// normal reference pool.
-#define FRAME_BUFFERS (REF_FRAMES + 4)
+#define FRAME_BUFFERS (REF_FRAMES + 7)
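+// That is, REF_FRAMES (8) + 4 decode scratch frames + 3 encoder scaled
+// references == 15 buffers in total.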
#define FRAME_CONTEXTS_LOG2 2
#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
+#define NUM_PING_PONG_BUFFERS 2
+
extern const struct {
PARTITION_CONTEXT above;
PARTITION_CONTEXT left;
@@ -56,21 +62,55 @@ typedef enum {
REFERENCE_MODES = 3,
} REFERENCE_MODE;
+typedef struct {
+ int_mv mv[2];
+ MV_REFERENCE_FRAME ref_frame[2];
+} MV_REF;
typedef struct {
int ref_count;
+ MV_REF *mvs;
+ int mi_rows;
+ int mi_cols;
vpx_codec_frame_buffer_t raw_frame_buffer;
YV12_BUFFER_CONFIG buf;
+
+ // The Following variables will only be used in frame parallel decode.
+
+ // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
+ // that no FrameWorker owns, or is decoding, this buffer.
+ VP9Worker *frame_worker_owner;
+
+  // row and col indicate how far the frame has been decoded, in real pixel
+  // units. They are reset to -1 when decoding begins and set to INT_MAX
+  // when the frame is fully decoded.
+ int row;
+ int col;
} RefCntBuffer;
-typedef struct VP9Common {
- struct vpx_internal_error_info error;
+typedef struct BufferPool {
+ // Protect BufferPool from being accessed by several FrameWorkers at
+ // the same time during frame parallel decode.
+ // TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t pool_mutex;
+#endif
+
+ // Private data associated with the frame buffer callbacks.
+ void *cb_priv;
- DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
- DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
+ vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+ vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+
+ RefCntBuffer frame_bufs[FRAME_BUFFERS];
- COLOR_SPACE color_space;
+ // Frame buffers allocated internally by the codec.
+ InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+typedef struct VP9Common {
+ struct vpx_internal_error_info error;
+ vpx_color_space_t color_space;
int width;
int height;
int display_width;
@@ -89,11 +129,17 @@ typedef struct VP9Common {
#endif
YV12_BUFFER_CONFIG *frame_to_show;
+ RefCntBuffer *prev_frame;
- RefCntBuffer frame_bufs[FRAME_BUFFERS];
+ // TODO(hkuang): Combine this with cur_buf in macroblockd.
+ RefCntBuffer *cur_frame;
int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
+ // Prepare ref_frame_map for the next frame.
+ // Only used in frame parallel decode.
+ int next_ref_frame_map[REF_FRAMES];
+
// TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
// roll new_fb_idx into it.
@@ -102,7 +148,10 @@ typedef struct VP9Common {
int new_fb_idx;
+#if CONFIG_VP9_POSTPROC
YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG post_proc_buffer_int;
+#endif
FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
FRAME_TYPE frame_type;
@@ -135,22 +184,42 @@ typedef struct VP9Common {
int y_dc_delta_q;
int uv_dc_delta_q;
int uv_ac_delta_q;
+ int16_t y_dequant[MAX_SEGMENTS][2];
+ int16_t uv_dequant[MAX_SEGMENTS][2];
/* We allocate a MODE_INFO struct for each macroblock, together with
an extra row on top and column on the left to simplify prediction. */
-
- int mi_idx;
- int prev_mi_idx;
int mi_alloc_size;
- MODE_INFO *mip_array[2];
-
MODE_INFO *mip; /* Base of allocated array */
MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+
+ // TODO(agrange): Move prev_mi into encoder structure.
+ // prev_mip and prev_mi will only be allocated in VP9 encoder.
MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
+ // Separate mi functions between encoder and decoder.
+ int (*alloc_mi)(struct VP9Common *cm, int mi_size);
+ void (*free_mi)(struct VP9Common *cm);
+ void (*setup_mi)(struct VP9Common *cm);
+
+ // Grid of pointers to 8x8 MODE_INFO structs. Any 8x8 not in the visible
+ // area will be NULL.
+ MODE_INFO **mi_grid_base;
+ MODE_INFO **mi_grid_visible;
+ MODE_INFO **prev_mi_grid_base;
+ MODE_INFO **prev_mi_grid_visible;
+
+ // Whether to use previous frame's motion vectors for prediction.
+ int use_prev_frame_mvs;
+
// Persistent mb segment id map used in prediction.
- unsigned char *last_frame_seg_map;
+ int seg_map_idx;
+ int prev_seg_map_idx;
+
+ uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
+ uint8_t *last_frame_seg_map;
+ uint8_t *current_frame_seg_map;
INTERP_FILTER interp_filter;
@@ -163,14 +232,17 @@ typedef struct VP9Common {
struct loopfilter lf;
struct segmentation seg;
+ // TODO(hkuang): Remove this as it is the same as frame_parallel_decode
+ // in pbi.
+ int frame_parallel_decode; // frame-based threading.
+
// Context probabilities for reference frame prediction
- int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
MV_REFERENCE_FRAME comp_var_ref[2];
REFERENCE_MODE reference_mode;
- FRAME_CONTEXT fc; /* this frame entropy */
- FRAME_CONTEXT frame_contexts[FRAME_CONTEXTS];
+ FRAME_CONTEXT *fc; /* this frame entropy */
+ FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS
unsigned int frame_context_idx; /* Context to use/update */
FRAME_COUNTS counts;
@@ -189,6 +261,7 @@ typedef struct VP9Common {
int frame_parallel_decoding_mode;
int log2_tile_cols, log2_tile_rows;
+ int byte_alignment;
// Private data associated with the frame buffer callbacks.
void *cb_priv;
@@ -198,31 +271,43 @@ typedef struct VP9Common {
// Handles memory for the codec.
InternalFrameBufferList int_frame_buffers;
+ // External BufferPool passed from outside.
+ BufferPool *buffer_pool;
+
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
} VP9_COMMON;
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool);
+void unlock_buffer_pool(BufferPool *const pool);
+
static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {
if (index < 0 || index >= REF_FRAMES)
return NULL;
if (cm->ref_frame_map[index] < 0)
return NULL;
assert(cm->ref_frame_map[index] < FRAME_BUFFERS);
- return &cm->frame_bufs[cm->ref_frame_map[index]].buf;
+ return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
}
static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
- return &cm->frame_bufs[cm->new_fb_idx].buf;
+ return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
}
static INLINE int get_free_fb(VP9_COMMON *cm) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
int i;
- for (i = 0; i < FRAME_BUFFERS; i++)
- if (cm->frame_bufs[i].ref_count == 0)
+
+ lock_buffer_pool(cm->buffer_pool);
+ for (i = 0; i < FRAME_BUFFERS; ++i)
+ if (frame_bufs[i].ref_count == 0)
break;
assert(i < FRAME_BUFFERS);
- cm->frame_bufs[i].ref_count = 1;
+ frame_bufs[i].ref_count = 1;
+ unlock_buffer_pool(cm->buffer_pool);
return i;
}
@@ -245,13 +330,14 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
- xd->plane[i].dqcoeff = xd->dqcoeff[i];
+ xd->plane[i].dqcoeff = xd->dqcoeff;
xd->above_context[i] = cm->above_context +
i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
}
xd->above_seg_context = cm->above_seg_context;
xd->mi_stride = cm->mi_stride;
+ xd->error_info = &cm->error;
}
static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
@@ -261,7 +347,7 @@ static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm,
int ctx) {
return frame_is_intra_only(cm) ? vp9_kf_partition_probs[ctx]
- : cm->fc.partition_prob[ctx];
+ : cm->fc->partition_prob[ctx];
}
static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
@@ -292,17 +378,23 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
// Are edges available for intra prediction?
xd->up_available = (mi_row != 0);
xd->left_available = (mi_col > tile->mi_col_start);
-}
+ if (xd->up_available) {
+ xd->above_mi = xd->mi[-xd->mi_stride];
+ // above_mi may be NULL in VP9 encoder's first pass.
+ xd->above_mbmi = xd->above_mi ? &xd->above_mi->mbmi : NULL;
+ } else {
+ xd->above_mi = NULL;
+ xd->above_mbmi = NULL;
+ }
-static INLINE void set_prev_mi(VP9_COMMON *cm) {
- const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
- cm->height == cm->last_height &&
- !cm->intra_only &&
- cm->last_show_frame;
- // Special case: set prev_mi to NULL when the previous mode info
- // context cannot be used.
- cm->prev_mi = use_prev_in_find_mv_refs ?
- cm->prev_mip + cm->mi_stride + 1 : NULL;
+ if (xd->left_available) {
+ xd->left_mi = xd->mi[-1];
+ // left_mi may be NULL in VP9 encoder's first pass.
+ xd->left_mbmi = xd->left_mi ? &xd->left_mi->mbmi : NULL;
+ } else {
+ xd->left_mi = NULL;
+ xd->left_mbmi = NULL;
+ }
}
static INLINE void update_partition_context(MACROBLOCKD *xd,
@@ -318,8 +410,8 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
// update the partition context at the end notes. set partition bits
// of block sizes larger than the current one to be one, and partition
// bits of smaller block sizes to be zero.
- vpx_memset(above_ctx, partition_context_lookup[subsize].above, bs);
- vpx_memset(left_ctx, partition_context_lookup[subsize].left, bs);
+ memset(above_ctx, partition_context_lookup[subsize].above, bs);
+ memset(left_ctx, partition_context_lookup[subsize].left, bs);
}
static INLINE int partition_plane_context(const MACROBLOCKD *xd,
@@ -327,21 +419,12 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd,
BLOCK_SIZE bsize) {
const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
-
const int bsl = mi_width_log2_lookup[bsize];
- const int bs = 1 << bsl;
- int above = 0, left = 0, i;
+  int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
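+  // update_partition_context() fills all the affected context entries with
+  // the same lookup value, so testing bit bsl of a single entry recovers the
+  // same above/left flags as OR-ing all bs entries.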
assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
assert(bsl >= 0);
- for (i = 0; i < bs; i++) {
- above |= above_ctx[i];
- left |= left_ctx[i];
- }
- above = (above & bs) > 0;
- left = (left & bs) > 0;
-
return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c
index 575ffbc30ac..983a4744dd6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c
@@ -79,6 +79,9 @@ const short vp9_rv[] = {
0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};
+static const uint8_t q_diff_thresh = 20;
+static const uint8_t last_q_thresh = 170;
+
void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
uint8_t *dst_ptr,
int src_pixels_per_line,
@@ -88,10 +91,7 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
int flimit) {
uint8_t const *p_src;
uint8_t *p_dst;
- int row;
- int col;
- int i;
- int v;
+ int row, col, i, v, kernel;
int pitch = src_pixels_per_line;
uint8_t d[8];
(void)dst_pixels_per_line;
@@ -102,8 +102,8 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
- int kernel = 4;
- int v = p_src[col];
+ kernel = 4;
+ v = p_src[col];
for (i = -2; i <= 2; i++) {
if (abs(v - p_src[col + i * pitch]) > flimit)
@@ -125,7 +125,7 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
d[i] = p_src[i];
for (col = 0; col < cols; col++) {
- int kernel = 4;
+ kernel = 4;
v = p_src[col];
d[col & 7] = v;
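The kernel/v pair hoisted in these hunks feeds a 5-tap smoothing filter whose body the hunk context elides. A simplified sketch of one output pixel of the vertical pass, assuming the conventional [1 1 4 1 1]/8 taps; treat the tap weights and the skip path as illustrative, not a verbatim copy of the elided code:

#include <stdlib.h>  /* abs() */

/* Sketch: accumulate five weighted taps starting from a rounding bias
 * of 4 (the weights sum to 8), but keep the centre pixel untouched
 * when any tap differs from it by more than flimit. */
static unsigned char down_pixel_sketch(const unsigned char *p, int pitch,
                                       int flimit) {
  static const int taps[5] = { 1, 1, 4, 1, 1 };
  const int v = p[0];
  int kernel = 4;
  int i;
  for (i = -2; i <= 2; i++) {
    if (abs(v - p[i * pitch]) > flimit)
      return (unsigned char)v;  /* edge: leave the pixel alone */
    kernel += taps[2 + i] * p[i * pitch];
  }
  return (unsigned char)(kernel >> 3);
}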
@@ -165,10 +165,7 @@ void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
int flimit) {
uint16_t const *p_src;
uint16_t *p_dst;
- int row;
- int col;
- int i;
- int v;
+ int row, col, i, v, kernel;
int pitch = src_pixels_per_line;
uint16_t d[8];
@@ -178,8 +175,8 @@ void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
- int kernel = 4;
- int v = p_src[col];
+ kernel = 4;
+ v = p_src[col];
for (i = -2; i <= 2; i++) {
if (abs(v - p_src[col + i * pitch]) > flimit)
@@ -202,7 +199,7 @@ void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
d[i] = p_src[i];
for (col = 0; col < cols; col++) {
- int kernel = 4;
+ kernel = 4;
v = p_src[col];
d[col & 7] = v;
@@ -515,22 +512,24 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
(dst->flags & YV12_FLAG_HIGHBITDEPTH));
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
- const uint16_t *const src = CONVERT_TO_SHORTPTR(srcs[i] + 2 * src_stride
- + 2);
- uint16_t *const dst = CONVERT_TO_SHORTPTR(dsts[i] + 2 * dst_stride + 2);
- vp9_highbd_post_proc_down_and_across(src, dst, src_stride, dst_stride,
- src_height, src_width, ppl);
+ const uint16_t *const src_plane = CONVERT_TO_SHORTPTR(
+ srcs[i] + 2 * src_stride + 2);
+ uint16_t *const dst_plane = CONVERT_TO_SHORTPTR(
+ dsts[i] + 2 * dst_stride + 2);
+ vp9_highbd_post_proc_down_and_across(src_plane, dst_plane, src_stride,
+ dst_stride, src_height, src_width,
+ ppl);
} else {
- const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
- uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+ const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
+ uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
- vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
- src_height, src_width, ppl);
+ vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride,
+ dst_stride, src_height, src_width, ppl);
}
#else
- const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
- uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
- vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+ const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
+ uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
+ vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride, dst_stride,
src_height, src_width, ppl);
#endif
}
@@ -555,16 +554,15 @@ static void fillrd(struct postproc_state *state, int q, int a) {
* a gaussian distribution with sigma determined by q.
*/
{
- double i;
int next, j;
next = 0;
for (i = -32; i < 32; i++) {
- int a = (int)(0.5 + 256 * gaussian(sigma, 0, i));
+ int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
- if (a) {
- for (j = 0; j < a; j++) {
+ if (a_i) {
+ for (j = 0; j < a_i; j++) {
char_dist[next + j] = (char) i;
}
@@ -616,9 +614,20 @@ void vp9_plane_add_noise_c(uint8_t *start, char *noise,
}
}
+static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
+ // Current mip will be the prev_mip for the next frame.
+ MODE_INFO *temp = cm->postproc_state.prev_mip;
+ cm->postproc_state.prev_mip = cm->mip;
+ cm->mip = temp;
+
+ // Update the upper left visible macroblock ptrs.
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ cm->postproc_state.prev_mi = cm->postproc_state.prev_mip + cm->mi_stride + 1;
+}
+
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
- const int q = MIN(63, cm->lf.filter_level * 10 / 6);
+ const int q = MIN(105, cm->lf.filter_level * 2);
const int flags = ppflags->post_proc_flag;
YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
struct postproc_state *const ppstate = &cm->postproc_state;
@@ -633,18 +642,74 @@ int vp9_post_proc_frame(struct VP9Common *cm,
vp9_clear_system_state();
-#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
+ // Allocate memory for prev_mip on the first frame.
+ if (cm->current_video_frame == 1) {
+ cm->postproc_state.last_base_qindex = cm->base_qindex;
+ cm->postproc_state.last_frame_valid = 1;
+ ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip));
+ if (!ppstate->prev_mip) {
+ return 1;
+ }
+ ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1;
+ memset(ppstate->prev_mip, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ }
+
+ // Allocate post_proc_buffer_int if needed.
+ if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) {
+ if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+ const int width = ALIGN_POWER_OF_TWO(cm->width, 4);
+ const int height = ALIGN_POWER_OF_TWO(cm->height, 4);
+
+ if (vp9_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment) < 0) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate MFQE framebuffer");
+ }
+
+ // Initialize the MFQE buffer to mid-gray (128) so post processing
+ // doesn't pull random data in from the edge.
+ memset(cm->post_proc_buffer_int.buffer_alloc, 128,
+ cm->post_proc_buffer.frame_size);
+ }
+ }
+
if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
+ VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL) < 0)
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate post-processing buffer");
-#endif
- if (flags & VP9D_DEMACROBLOCK) {
+ if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
+ cm->postproc_state.last_frame_valid && cm->bit_depth == 8 &&
+ cm->postproc_state.last_base_qindex <= last_q_thresh &&
+ cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
+ vp9_mfqe(cm);
+ // TODO(jackychen): Consider whether to enable deblocking by default
+ // when mfqe is enabled. Both the quality and the speed need to be
+ // taken into consideration.
+ if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+ vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+ }
+ if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
+ deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
+ q + (ppflags->deblocking_level - 5) * 10,
+ 1, 0);
+ } else if (flags & VP9D_DEBLOCK) {
+ vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q);
+ } else {
+ vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+ }
+ } else if (flags & VP9D_DEMACROBLOCK) {
deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
q + (ppflags->deblocking_level - 5) * 10, 1, 0);
} else if (flags & VP9D_DEBLOCK) {
@@ -653,6 +718,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
}
+ cm->postproc_state.last_base_qindex = cm->base_qindex;
+ cm->postproc_state.last_frame_valid = 1;
+
if (flags & VP9D_ADDNOISE) {
const int noise_level = ppflags->noise_level;
if (ppstate->last_q != q ||
@@ -673,6 +741,7 @@ int vp9_post_proc_frame(struct VP9Common *cm,
dest->uv_width = dest->y_width >> cm->subsampling_x;
dest->uv_height = dest->y_height >> cm->subsampling_y;
+ swap_mi_and_prev_mi(cm);
return 0;
}
-#endif
+#endif // CONFIG_VP9_POSTPROC
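The MFQE branch above fires only under a quality-jump heuristic: at least one decoded frame behind us, 8-bit content, a previously good frame (base qindex <= 170), and a marked quality drop (qindex jump >= 20). A condensed restatement of the gate as a reading aid, using the file-scope thresholds introduced earlier:

/* Reading aid, not a drop-in: the condition guarding vp9_mfqe()
 * in vp9_post_proc_frame(), folded into one predicate. */
static int mfqe_enabled_sketch(const struct VP9Common *cm, int flags) {
  return (flags & VP9D_MFQE) &&
         cm->current_video_frame >= 2 &&            /* prev_mip exists */
         cm->postproc_state.last_frame_valid &&
         cm->bit_depth == 8 &&
         cm->postproc_state.last_base_qindex <= last_q_thresh &&
         cm->base_qindex - cm->postproc_state.last_base_qindex >=
             q_diff_thresh;
}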
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h
index ebebc1ae346..035c9cdf846 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h
@@ -14,6 +14,8 @@
#include "vpx_ports/mem.h"
#include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_mfqe.h"
#include "vp9/common/vp9_ppflags.h"
#ifdef __cplusplus
@@ -24,6 +26,10 @@ struct postproc_state {
int last_q;
int last_noise;
char noise[3072];
+ int last_base_qindex;
+ int last_frame_valid;
+ MODE_INFO *prev_mip;
+ MODE_INFO *prev_mi;
DECLARE_ALIGNED(16, char, blackclamp[16]);
DECLARE_ALIGNED(16, char, whiteclamp[16]);
DECLARE_ALIGNED(16, char, bothclamp[16]);
@@ -31,6 +37,8 @@ struct postproc_state {
struct VP9Common;
+#define MFQE_PRECISION 4
+
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h
index 1644a1bbbe8..12b989f43a5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h
@@ -26,7 +26,8 @@ enum {
VP9D_DEBUG_TXT_RATE_INFO = 1 << 6,
VP9D_DEBUG_DRAW_MV = 1 << 7,
VP9D_DEBUG_CLR_BLK_MODES = 1 << 8,
- VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
+ VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
+ VP9D_MFQE = 1 << 10
};
typedef struct {
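Callers opt in to the new bit through the post_proc_flag word carried by vp9_ppflags_t. A minimal sketch of a decoder-side caller enabling MFQE together with deblocking; level 5 is neutral because the filter strength is q + (level - 5) * 10:

/* Sketch: building the flags consumed by vp9_post_proc_frame(). */
static vp9_ppflags_t make_mfqe_ppflags(void) {
  vp9_ppflags_t ppflags = { 0 };
  ppflags.post_proc_flag = VP9D_MFQE | VP9D_DEBLOCK;
  ppflags.deblocking_level = 5;  /* neutral: q + (5 - 5) * 10 == q */
  return ppflags;
}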
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c
index 901a043f69b..0aac4a9e677 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c
@@ -15,21 +15,17 @@
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_seg_common.h"
-static INLINE const MB_MODE_INFO *get_mbmi(const MODE_INFO *const mi) {
- return (mi != NULL) ? &mi->mbmi : NULL;
-}
-
// Returns a context number for the given MB prediction signal
int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialized to 0.
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int left_type = left_mbmi != NULL && is_inter_block(left_mbmi) ?
- left_mbmi->interp_filter : SWITCHABLE_FILTERS;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const int above_type = above_mbmi != NULL && is_inter_block(above_mbmi) ?
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int left_type = xd->left_available && is_inter_block(left_mbmi) ?
+ left_mbmi->interp_filter : SWITCHABLE_FILTERS;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const int above_type = xd->up_available && is_inter_block(above_mbmi) ?
above_mbmi->interp_filter : SWITCHABLE_FILTERS;
if (left_type == above_type)
@@ -50,10 +46,10 @@ int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
// 2 - intra/--, --/intra
// 3 - intra/intra
int vp9_get_intra_inter_context(const MACROBLOCKD *xd) {
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
if (has_above && has_left) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
@@ -70,10 +66,10 @@ int vp9_get_intra_inter_context(const MACROBLOCKD *xd) {
int vp9_get_reference_mode_context(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int ctx;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -113,10 +109,10 @@ int vp9_get_reference_mode_context(const VP9_COMMON *cm,
int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int above_in_image = above_mbmi != NULL;
- const int left_in_image = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
@@ -194,10 +190,10 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int pred_context;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -260,10 +256,10 @@ int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
int pred_context;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
@@ -348,11 +344,11 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
// left of the entries corresponding to real blocks.
// The prediction flags in these dummy entries are initialized to 0.
int vp9_get_tx_size_context(const MACROBLOCKD *xd) {
- const int max_tx_size = max_txsize_lookup[xd->mi[0].src_mi->mbmi.sb_type];
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type];
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size
: max_tx_size;
int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size
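The refactor repeats one pattern across this file: read the cached neighbor, substitute a neutral value when it is unavailable or skipped, then fold both sides into a small index. The tx-size case cut off by the hunk completes as sketched below, mirroring the lines above:

/* Sketch of how the tx_size context resolves: a missing side copies
 * the other, and the sum compares against max_tx_size to give a
 * binary context. Mirrors vp9_get_tx_size_context(). */
static int tx_size_context_sketch(int has_above, int has_left,
                                  int above_ctx, int left_ctx,
                                  int max_tx_size) {
  if (!has_left) left_ctx = above_ctx;
  if (!has_above) above_ctx = left_ctx;
  return (above_ctx + left_ctx) > max_tx_size;
}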
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h
index 39774f14285..bc19d28b906 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h
@@ -18,20 +18,12 @@
extern "C" {
#endif
-static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) {
- return xd->up_available ? xd->mi[-xd->mi_stride].src_mi : NULL;
-}
-
-static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) {
- return xd->left_available ? xd->mi[-1].src_mi : NULL;
-}
-
int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE bsize, int mi_row, int mi_col);
static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
const int above_sip = (above_mi != NULL) ?
above_mi->mbmi.seg_id_predicted : 0;
const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0;
@@ -45,8 +37,8 @@ static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
}
static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0;
const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0;
return above_skip + left_skip;
@@ -54,7 +46,7 @@ static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- return cm->fc.skip_probs[vp9_get_skip_context(xd)];
+ return cm->fc->skip_probs[vp9_get_skip_context(xd)];
}
int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
@@ -63,14 +55,14 @@ int vp9_get_intra_inter_context(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- return cm->fc.intra_inter_prob[vp9_get_intra_inter_context(xd)];
+ return cm->fc->intra_inter_prob[vp9_get_intra_inter_context(xd)];
}
int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- return cm->fc.comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
+ return cm->fc->comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
}
int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
@@ -79,21 +71,21 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd);
- return cm->fc.comp_ref_prob[pred_context];
+ return cm->fc->comp_ref_prob[pred_context];
}
int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
+ return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
}
int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
+ return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
}
int vp9_get_tx_size_context(const MACROBLOCKD *xd);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c
index a1befc63e88..3b7b9bf3b39 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c
@@ -29,33 +29,25 @@ const uint8_t vp9_norm[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
-
static unsigned int tree_merge_probs_impl(unsigned int i,
const vp9_tree_index *tree,
const vp9_prob *pre_probs,
const unsigned int *counts,
- unsigned int count_sat,
- unsigned int max_update,
vp9_prob *probs) {
const int l = tree[i];
const unsigned int left_count = (l <= 0)
? counts[-l]
- : tree_merge_probs_impl(l, tree, pre_probs, counts,
- count_sat, max_update, probs);
+ : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
const int r = tree[i + 1];
const unsigned int right_count = (r <= 0)
? counts[-r]
- : tree_merge_probs_impl(r, tree, pre_probs, counts,
- count_sat, max_update, probs);
+ : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
const unsigned int ct[2] = { left_count, right_count };
- probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
- count_sat, max_update);
+ probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
return left_count + right_count;
}
void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
- const unsigned int *counts, unsigned int count_sat,
- unsigned int max_update_factor, vp9_prob *probs) {
- tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat,
- max_update_factor, probs);
+ const unsigned int *counts, vp9_prob *probs) {
+ tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
}
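vp9_tree_index arrays encode a binary tree compactly: a non-positive entry is a leaf holding a negated symbol index into counts[], a positive entry is the offset of the child node pair. A toy 3-symbol example of the simplified merge, with assumed counts and flat prior probabilities:

/* Toy tree: node 0 splits off symbol 0; node 2 splits symbols 1, 2.
 * tree_merge_probs_impl() recurses into node 2 first (merging
 * probs[1] from counts {30, 60}), then merges probs[0] from
 * {10, 30 + 60}. */
static void toy_tree_merge(void) {
  static const vp9_tree_index toy_tree[4] = { 0 /* -sym0 */, 2, -1, -2 };
  static const vp9_prob toy_pre_probs[2] = { 128, 128 };
  static const unsigned int toy_counts[3] = { 10, 30, 60 };
  vp9_prob toy_probs[2];  /* one probability per internal node */
  vp9_tree_merge_probs(toy_tree, toy_pre_probs, toy_counts, toy_probs);
}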
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h
index bc1511a5e18..c69c62c81f8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h
@@ -33,6 +33,8 @@ typedef int8_t vp9_tree_index;
#define vp9_complement(x) (255 - x)
+#define MODE_MV_COUNT_SAT 20
+
/* We build coding trees compactly in arrays.
Each node of the tree is a pair of vp9_tree_indices.
Array index often references a corresponding probability table.
@@ -69,9 +71,28 @@ static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
return weighted_prob(pre_prob, prob, factor);
}
+// Each entry is MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT.
+static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
+ 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
+ 70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+};
+
+static INLINE vp9_prob mode_mv_merge_probs(vp9_prob pre_prob,
+ const unsigned int ct[2]) {
+ const unsigned int den = ct[0] + ct[1];
+ if (den == 0) {
+ return pre_prob;
+ } else {
+ const unsigned int count = MIN(den, MODE_MV_COUNT_SAT);
+ const unsigned int factor = count_to_update_factor[count];
+ const vp9_prob prob =
+ clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
+ return weighted_prob(pre_prob, prob, factor);
+ }
+}
+
void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
- const unsigned int *counts, unsigned int count_sat,
- unsigned int max_update_factor, vp9_prob *probs);
+ const unsigned int *counts, vp9_prob *probs);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
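The table above is MODE_MV_MAX_UPDATE_FACTOR (128) scaled by count / MODE_MV_COUNT_SAT and truncated, so the merge is a fixed-point blend toward the observed frequency that strengthens with more observations. A worked example with assumed counts:

/* Worked example of mode_mv_merge_probs() with assumed inputs:
 *   pre_prob = 100, ct = {30, 10}  ->  den = 40
 *   count  = MIN(40, 20) = 20      ->  factor = 128 (full strength)
 *   prob   = clip_prob((30 * 256 + 20) / 40) = 192
 *   result = weighted_prob(100, 192, 128)
 *          = (100 * 128 + 192 * 128 + 128) >> 8 = 146
 * With sparse counts, say ct = {3, 1}: den = 4, factor = 25,
 * prob = 192, result = (100 * 231 + 192 * 25 + 128) >> 8 = 109,
 * which stays much closer to pre_prob. */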
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c
index 3492a23d01c..11eaf2e2d70 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c
@@ -20,97 +20,7 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
-static void build_mc_border(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- int x, int y, int b_w, int b_h, int w, int h) {
- // Get a pointer to the start of the real data for this row.
- const uint8_t *ref_row = src - x - y * src_stride;
-
- if (y >= h)
- ref_row += (h - 1) * src_stride;
- else if (y > 0)
- ref_row += y * src_stride;
-
- do {
- int right = 0, copy;
- int left = x < 0 ? -x : 0;
-
- if (left > b_w)
- left = b_w;
-
- if (x + b_w > w)
- right = x + b_w - w;
-
- if (right > b_w)
- right = b_w;
-
- copy = b_w - left - right;
-
- if (left)
- memset(dst, ref_row[0], left);
-
- if (copy)
- memcpy(dst + left, ref_row + x + left, copy);
-
- if (right)
- memset(dst + left + copy, ref_row[w - 1], right);
-
- dst += dst_stride;
- ++y;
-
- if (y > 0 && y < h)
- ref_row += src_stride;
- } while (--b_h);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void high_build_mc_border(const uint8_t *src8, int src_stride,
- uint16_t *dst, int dst_stride,
- int x, int y, int b_w, int b_h,
- int w, int h) {
- // Get a pointer to the start of the real data for this row.
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- const uint16_t *ref_row = src - x - y * src_stride;
-
- if (y >= h)
- ref_row += (h - 1) * src_stride;
- else if (y > 0)
- ref_row += y * src_stride;
-
- do {
- int right = 0, copy;
- int left = x < 0 ? -x : 0;
-
- if (left > b_w)
- left = b_w;
-
- if (x + b_w > w)
- right = x + b_w - w;
-
- if (right > b_w)
- right = b_w;
-
- copy = b_w - left - right;
-
- if (left)
- vpx_memset16(dst, ref_row[0], left);
-
- if (copy)
- memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
-
- if (right)
- vpx_memset16(dst + left + copy, ref_row[w - 1], right);
-
- dst += dst_stride;
- ++y;
-
- if (y > 0 && y < h)
- ref_row += src_stride;
- } while (--b_h);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-static void inter_predictor(const uint8_t *src, int src_stride,
+void inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int subpel_x,
const int subpel_y,
@@ -123,29 +33,8 @@ static void inter_predictor(const uint8_t *src, int src_stride,
kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
}
-void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const MV *src_mv,
- const struct scale_factors *sf,
- int w, int h, int ref,
- const InterpKernel *kernel,
- enum mv_precision precision,
- int x, int y) {
- const int is_q4 = precision == MV_PRECISION_Q4;
- const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
- is_q4 ? src_mv->col : src_mv->col * 2 };
- MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
- const int subpel_x = mv.col & SUBPEL_MASK;
- const int subpel_y = mv.row & SUBPEL_MASK;
-
- src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
-
- inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
- sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4);
-}
-
#if CONFIG_VP9_HIGHBITDEPTH
-static void high_inter_predictor(const uint8_t *src, int src_stride,
+void high_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int subpel_x,
const int subpel_y,
@@ -180,6 +69,27 @@ void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const MV *src_mv,
+ const struct scale_factors *sf,
+ int w, int h, int ref,
+ const InterpKernel *kernel,
+ enum mv_precision precision,
+ int x, int y) {
+ const int is_q4 = precision == MV_PRECISION_Q4;
+ const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+ is_q4 ? src_mv->col : src_mv->col * 2 };
+ MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
+ const int subpel_x = mv.col & SUBPEL_MASK;
+ const int subpel_y = mv.row & SUBPEL_MASK;
+
+ src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+
+ inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+ sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4);
+}
+
static INLINE int round_mv_comp_q4(int value) {
return (value < 0 ? value - 2 : value + 2) / 4;
}
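vp9_build_inter_predictor() splits the (possibly scaled) motion vector into an integer pixel offset folded into the src pointer and a 1/16-pel phase selecting the filter kernel. A worked example with an assumed q4 MV:

/* Worked example of the split above, with SUBPEL_BITS == 4 and
 * SUBPEL_MASK == 15. Assume a scaled MV of { 37, -9 } in 1/16-pel:
 *   row: 37 >> 4 == 2 full pixels down,  phase 37 & 15 == 5
 *   col: -9 >> 4 == -1 full pixel left,  phase -9 & 15 == 7
 * so src advances by 2 * src_stride - 1, and the convolution runs
 * kernel[7] horizontally and kernel[5] vertically. */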
@@ -234,8 +144,8 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
return clamped_mv;
}
-static MV average_split_mvs(const struct macroblockd_plane *pd,
- const MODE_INFO *mi, int ref, int block) {
+MV average_split_mvs(const struct macroblockd_plane *pd,
+ const MODE_INFO *mi, int ref, int block) {
const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
MV res = {0, 0};
switch (ss_idx) {
@@ -252,17 +162,17 @@ static MV average_split_mvs(const struct macroblockd_plane *pd,
res = mi_mv_pred_q4(mi, ref);
break;
default:
- assert(ss_idx <= 3 || ss_idx >= 0);
+ assert(ss_idx <= 3 && ss_idx >= 0);
}
return res;
}
-static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- const MODE_INFO *mi = xd->mi[0].src_mi;
+ const MODE_INFO *mi = xd->mi[0];
const int is_compound = has_second_ref(&mi->mbmi);
const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
int ref;
@@ -336,7 +246,7 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
- if (xd->mi[0].src_mi->mbmi.sb_type < BLOCK_8X8) {
+ if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
@@ -354,231 +264,31 @@ void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
}
+
+void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int plane) {
+ build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane);
+}
+
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
MAX_MB_PLANE - 1);
}
+
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
MAX_MB_PLANE - 1);
}
-// TODO(jingning): This function serves as a placeholder for decoder prediction
-// using on demand border extension. It should be moved to /decoder/ directory.
-static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
- int bw, int bh,
- int x, int y, int w, int h,
- int mi_x, int mi_y) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const MODE_INFO *mi = xd->mi[0].src_mi;
- const int is_compound = has_second_ref(&mi->mbmi);
- const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
- int ref;
-
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
- struct buf_2d *const pre_buf = &pd->pre[ref];
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
- const MV mv = mi->mbmi.sb_type < BLOCK_8X8
- ? average_split_mvs(pd, mi, ref, block)
- : mi->mbmi.mv[ref].as_mv;
-
- const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
- pd->subsampling_x,
- pd->subsampling_y);
-
- MV32 scaled_mv;
- int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride,
- subpel_x, subpel_y;
- uint8_t *ref_frame, *buf_ptr;
- const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
- const int is_scaled = vp9_is_scaled(sf);
-
- // Get reference frame pointer, width and height.
- if (plane == 0) {
- frame_width = ref_buf->y_crop_width;
- frame_height = ref_buf->y_crop_height;
- ref_frame = ref_buf->y_buffer;
- } else {
- frame_width = ref_buf->uv_crop_width;
- frame_height = ref_buf->uv_crop_height;
- ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
- }
-
- if (is_scaled) {
- // Co-ordinate of containing block to pixel precision.
- int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
- int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
- // Co-ordinate of the block to 1/16th pixel precision.
- x0_16 = (x_start + x) << SUBPEL_BITS;
- y0_16 = (y_start + y) << SUBPEL_BITS;
-
- // Co-ordinate of current block in reference frame
- // to 1/16th pixel precision.
- x0_16 = sf->scale_value_x(x0_16, sf);
- y0_16 = sf->scale_value_y(y0_16, sf);
-
- // Map the top left corner of the block into the reference frame.
- x0 = sf->scale_value_x(x_start + x, sf);
- y0 = sf->scale_value_y(y_start + y, sf);
-
- // Scale the MV and incorporate the sub-pixel offset of the block
- // in the reference frame.
- scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
- xs = sf->x_step_q4;
- ys = sf->y_step_q4;
- } else {
- // Co-ordinate of containing block to pixel precision.
- x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
- y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
- // Co-ordinate of the block to 1/16th pixel precision.
- x0_16 = x0 << SUBPEL_BITS;
- y0_16 = y0 << SUBPEL_BITS;
-
- scaled_mv.row = mv_q4.row;
- scaled_mv.col = mv_q4.col;
- xs = ys = 16;
- }
- subpel_x = scaled_mv.col & SUBPEL_MASK;
- subpel_y = scaled_mv.row & SUBPEL_MASK;
-
- // Calculate the top left corner of the best matching block in the
- // reference frame.
- x0 += scaled_mv.col >> SUBPEL_BITS;
- y0 += scaled_mv.row >> SUBPEL_BITS;
- x0_16 += scaled_mv.col;
- y0_16 += scaled_mv.row;
-
- // Get reference block pointer.
- buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
- buf_stride = pre_buf->stride;
-
- // Do border extension if there is motion or the
- // width/height is not a multiple of 8 pixels.
- if (is_scaled || scaled_mv.col || scaled_mv.row ||
- (frame_width & 0x7) || (frame_height & 0x7)) {
- // Get reference block bottom right coordinate.
- int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
- int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
- int x_pad = 0, y_pad = 0;
-
- if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
- x0 -= VP9_INTERP_EXTEND - 1;
- x1 += VP9_INTERP_EXTEND;
- x_pad = 1;
- }
-
- if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
- y0 -= VP9_INTERP_EXTEND - 1;
- y1 += VP9_INTERP_EXTEND;
- y_pad = 1;
- }
-
- // Skip border extension if block is inside the frame.
- if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
- y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
- uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
- // Extend the border.
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_build_mc_border(buf_ptr1,
- pre_buf->stride,
- xd->mc_buf_high,
- x1 - x0 + 1,
- x0,
- y0,
- x1 - x0 + 1,
- y1 - y0 + 1,
- frame_width,
- frame_height);
- buf_stride = x1 - x0 + 1;
- buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) +
- y_pad * 3 * buf_stride + x_pad * 3;
- } else {
- build_mc_border(buf_ptr1,
- pre_buf->stride,
- xd->mc_buf,
- x1 - x0 + 1,
- x0,
- y0,
- x1 - x0 + 1,
- y1 - y0 + 1,
- frame_width,
- frame_height);
- buf_stride = x1 - x0 + 1;
- buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
- }
-#else
- build_mc_border(buf_ptr1,
- pre_buf->stride,
- xd->mc_buf,
- x1 - x0 + 1,
- x0,
- y0,
- x1 - x0 + 1,
- y1 - y0 + 1,
- frame_width,
- frame_height);
- buf_stride = x1 - x0 + 1;
- buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
-#endif // CONFIG_VP9_HIGHBITDEPTH
- }
- }
-
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
- } else {
- inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys);
- }
-#else
- inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys);
-#endif // CONFIG_VP9_HIGHBITDEPTH
- }
-}
-
-void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
- int plane;
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
- for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
- const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
- &xd->plane[plane]);
- const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
- const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
- const int bw = 4 * num_4x4_w;
- const int bh = 4 * num_4x4_h;
-
- if (xd->mi[0].src_mi->mbmi.sb_type < BLOCK_8X8) {
- int i = 0, x, y;
- assert(bsize == BLOCK_8X8);
- for (y = 0; y < num_4x4_h; ++y)
- for (x = 0; x < num_4x4_w; ++x)
- dec_build_inter_predictors(xd, plane, i++, bw, bh,
- 4 * x, 4 * y, 4, 4, mi_x, mi_y);
- } else {
- dec_build_inter_predictors(xd, plane, 0, bw, bh,
- 0, 0, bw, bh, mi_x, mi_y);
- }
- }
-}
-
void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col) {
- uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
+ uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+ src->v_buffer};
+ const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+ src->uv_stride};
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -594,11 +304,10 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
const struct scale_factors *sf) {
if (src != NULL) {
int i;
- uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
-
+ uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+ src->v_buffer};
+ const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+ src->uv_stride};
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblockd_plane *const pd = &xd->plane[i];
setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h
index 3eaf07cf85f..e7057445a07 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h
@@ -18,18 +18,49 @@
extern "C" {
#endif
+void inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x,
+ const int subpel_y,
+ const struct scale_factors *sf,
+ int w, int h, int ref,
+ const InterpKernel *kernel,
+ int xs, int ys);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x,
+ const int subpel_y,
+ const struct scale_factors *sf,
+ int w, int h, int ref,
+ const InterpKernel *kernel,
+ int xs, int ys, int bd);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+MV average_split_mvs(const struct macroblockd_plane *pd, const MODE_INFO *mi,
+ int ref, int block);
+
+MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
+ int bw, int bh, int ss_x, int ss_y);
+
+void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+ int bw, int bh,
+ int x, int y, int w, int h,
+ int mi_x, int mi_y);
+
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
+void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int plane);
+
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
-void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE bsize);
-
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *mv_q3,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c
index 720bb445de5..825d03d69b2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c
@@ -12,6 +12,7 @@
#include "./vp9_rtcd.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_onyxc_int.h"
@@ -29,6 +30,25 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
ADST_ADST, // TM
};
+enum {
+ NEED_LEFT = 1 << 1,
+ NEED_ABOVE = 1 << 2,
+ NEED_ABOVERIGHT = 1 << 3,
+};
+
+static const uint8_t extend_modes[INTRA_MODES] = {
+ NEED_ABOVE | NEED_LEFT, // DC
+ NEED_ABOVE, // V
+ NEED_LEFT, // H
+ NEED_ABOVERIGHT, // D45
+ NEED_LEFT | NEED_ABOVE, // D135
+ NEED_LEFT | NEED_ABOVE, // D117
+ NEED_LEFT | NEED_ABOVE, // D153
+ NEED_LEFT, // D207
+ NEED_ABOVERIGHT, // D63
+ NEED_LEFT | NEED_ABOVE, // TM
+};
+
// This serves as a wrapper function, so that all the prediction functions
// can be unified and accessed as a pointer array. Note that the boundary
// pixels above and to the left are not necessarily used all the time.
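The extend_modes table added above lets build_intra_predictors() prepare only the borders a given mode actually reads, instead of always extending 2 * bs above pixels. A sketch of the gating this enables; the fill values 129 and 127 match the unavailable-edge constants used further down:

/* Sketch: per-mode border preparation keyed off extend_modes[]. */
static void prepare_borders_sketch(PREDICTION_MODE mode) {
  if (extend_modes[mode] & NEED_LEFT) {
    /* fill left_col[0..bs-1] from ref, else memset(left_col, 129, bs) */
  }
  if (extend_modes[mode] & NEED_ABOVE) {
    /* fill above_row[0..bs-1], else memset(above_row, 127, bs) */
  }
  if (extend_modes[mode] & NEED_ABOVERIGHT) {
    /* fill above_row[0..2*bs-1], extending past the frame edge */
  }
}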
@@ -225,7 +245,7 @@ static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
(void) left;
(void) bd;
for (r = 0; r < bs; r++) {
- vpx_memcpy(dst, above, bs * sizeof(uint16_t));
+ memcpy(dst, above, bs * sizeof(uint16_t));
dst += stride;
}
}
@@ -468,7 +488,7 @@ static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
(void) left;
for (r = 0; r < bs; r++) {
- vpx_memcpy(dst, above, bs);
+ memcpy(dst, above, bs);
dst += stride;
}
}
@@ -480,7 +500,7 @@ static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
(void) above;
for (r = 0; r < bs; r++) {
- vpx_memset(dst, left[r], bs);
+ memset(dst, left[r], bs);
dst += stride;
}
}
@@ -506,7 +526,7 @@ static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
(void) left;
for (r = 0; r < bs; r++) {
- vpx_memset(dst, 128, bs);
+ memset(dst, 128, bs);
dst += stride;
}
}
@@ -523,7 +543,7 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) {
- vpx_memset(dst, expected_dc, bs);
+ memset(dst, expected_dc, bs);
dst += stride;
}
}
@@ -539,7 +559,7 @@ static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) {
- vpx_memset(dst, expected_dc, bs);
+ memset(dst, expected_dc, bs);
dst += stride;
}
}
@@ -558,7 +578,7 @@ static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
expected_dc = (sum + (count >> 1)) / count;
for (r = 0; r < bs; r++) {
- vpx_memset(dst, expected_dc, bs);
+ memset(dst, expected_dc, bs);
dst += stride;
}
}
@@ -579,7 +599,7 @@ static intra_high_pred_fn pred_high[INTRA_MODES][4];
static intra_high_pred_fn dc_pred_high[2][2][4];
#endif // CONFIG_VP9_HIGHBITDEPTH
-void vp9_init_intra_predictors() {
+static void vp9_init_intra_predictors_internal(void) {
#define INIT_ALL_SIZES(p, type) \
p[TX_4X4] = vp9_##type##_predictor_4x4; \
p[TX_8X8] = vp9_##type##_predictor_8x8; \
@@ -637,8 +657,8 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
int i;
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 64);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 128 + 16);
+ DECLARE_ALIGNED(16, uint16_t, left_col[32]);
+ DECLARE_ALIGNED(16, uint16_t, above_data[128 + 16]);
uint16_t *above_row = above_data + 16;
const uint16_t *const_above_row = above_row;
const int bs = 4 << tx_size;
@@ -698,32 +718,26 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
/* slower path if the block needs border extension */
if (x0 + 2 * bs <= frame_width) {
if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, 2 * bs * sizeof(uint16_t));
+ memcpy(above_row, above_ref, 2 * bs * sizeof(uint16_t));
} else {
- vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t));
+ memcpy(above_row, above_ref, bs * sizeof(uint16_t));
vpx_memset16(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 + bs <= frame_width) {
const int r = frame_width - x0;
if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t));
+ memcpy(above_row, above_ref, r * sizeof(uint16_t));
vpx_memset16(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
- vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t));
+ memcpy(above_row, above_ref, bs * sizeof(uint16_t));
vpx_memset16(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 <= frame_width) {
const int r = frame_width - x0;
- if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t));
- vpx_memset16(above_row + r, above_row[r - 1],
- x0 + 2 * bs - frame_width);
- } else {
- vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t));
- vpx_memset16(above_row + r, above_row[r - 1],
+ memcpy(above_row, above_ref, r * sizeof(uint16_t));
+ vpx_memset16(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
- }
}
// TODO(Peter) this value should probably change for high bitdepth
above_row[-1] = left_available ? above_ref[-1] : (base+1);
@@ -732,9 +746,9 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
if (bs == 4 && right_available && left_available) {
const_above_row = above_ref;
} else {
- vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t));
+ memcpy(above_row, above_ref, bs * sizeof(uint16_t));
if (bs == 4 && right_available)
- vpx_memcpy(above_row + bs, above_ref + bs, bs * sizeof(uint16_t));
+ memcpy(above_row + bs, above_ref + bs, bs * sizeof(uint16_t));
else
vpx_memset16(above_row + bs, above_row[bs - 1], bs);
// TODO(Peter): this value should probably change for high bitdepth
@@ -766,8 +780,8 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
int right_available, int x, int y,
int plane) {
int i;
- DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
+ DECLARE_ALIGNED(16, uint8_t, left_col[32]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[128 + 16]);
uint8_t *above_row = above_data + 16;
const uint8_t *const_above_row = above_row;
const int bs = 4 << tx_size;
@@ -795,81 +809,103 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
- vpx_memset(left_col, 129, 64);
-
- // left
- if (left_available) {
- if (xd->mb_to_bottom_edge < 0) {
- /* slower path if the block needs border extension */
- if (y0 + bs <= frame_height) {
- for (i = 0; i < bs; ++i)
- left_col[i] = ref[i * ref_stride - 1];
+ // NEED_LEFT
+ if (extend_modes[mode] & NEED_LEFT) {
+ if (left_available) {
+ if (xd->mb_to_bottom_edge < 0) {
+ /* slower path if the block needs border extension */
+ if (y0 + bs <= frame_height) {
+ for (i = 0; i < bs; ++i)
+ left_col[i] = ref[i * ref_stride - 1];
+ } else {
+ const int extend_bottom = frame_height - y0;
+ for (i = 0; i < extend_bottom; ++i)
+ left_col[i] = ref[i * ref_stride - 1];
+ for (; i < bs; ++i)
+ left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
+ }
} else {
- const int extend_bottom = frame_height - y0;
- for (i = 0; i < extend_bottom; ++i)
+ /* faster path if the block does not need extension */
+ for (i = 0; i < bs; ++i)
left_col[i] = ref[i * ref_stride - 1];
- for (; i < bs; ++i)
- left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
}
} else {
- /* faster path if the block does not need extension */
- for (i = 0; i < bs; ++i)
- left_col[i] = ref[i * ref_stride - 1];
+ memset(left_col, 129, bs);
}
}
- // TODO(hkuang) do not extend 2*bs pixels for all modes.
- // above
- if (up_available) {
- const uint8_t *above_ref = ref - ref_stride;
- if (xd->mb_to_right_edge < 0) {
- /* slower path if the block needs border extension */
- if (x0 + 2 * bs <= frame_width) {
- if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, 2 * bs);
- } else {
- vpx_memcpy(above_row, above_ref, bs);
- vpx_memset(above_row + bs, above_row[bs - 1], bs);
- }
- } else if (x0 + bs <= frame_width) {
- const int r = frame_width - x0;
- if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, r);
- vpx_memset(above_row + r, above_row[r - 1],
- x0 + 2 * bs - frame_width);
- } else {
- vpx_memcpy(above_row, above_ref, bs);
- vpx_memset(above_row + bs, above_row[bs - 1], bs);
+ // NEED_ABOVE
+ if (extend_modes[mode] & NEED_ABOVE) {
+ if (up_available) {
+ const uint8_t *above_ref = ref - ref_stride;
+ if (xd->mb_to_right_edge < 0) {
+ /* slower path if the block needs border extension */
+ if (x0 + bs <= frame_width) {
+ memcpy(above_row, above_ref, bs);
+ } else if (x0 <= frame_width) {
+ const int r = frame_width - x0;
+ memcpy(above_row, above_ref, r);
+ memset(above_row + r, above_row[r - 1], x0 + bs - frame_width);
}
- } else if (x0 <= frame_width) {
- const int r = frame_width - x0;
- if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, r);
- vpx_memset(above_row + r, above_row[r - 1],
- x0 + 2 * bs - frame_width);
+ } else {
+ /* faster path if the block does not need extension */
+ if (bs == 4 && right_available && left_available) {
+ const_above_row = above_ref;
} else {
- vpx_memcpy(above_row, above_ref, r);
- vpx_memset(above_row + r, above_row[r - 1],
- x0 + 2 * bs - frame_width);
+ memcpy(above_row, above_ref, bs);
}
}
above_row[-1] = left_available ? above_ref[-1] : 129;
} else {
- /* faster path if the block does not need extension */
- if (bs == 4 && right_available && left_available) {
- const_above_row = above_ref;
+ memset(above_row, 127, bs);
+ above_row[-1] = 127;
+ }
+ }
+
+ // NEED_ABOVERIGHT
+ if (extend_modes[mode] & NEED_ABOVERIGHT) {
+ if (up_available) {
+ const uint8_t *above_ref = ref - ref_stride;
+ if (xd->mb_to_right_edge < 0) {
+ /* slower path if the block needs border extension */
+ if (x0 + 2 * bs <= frame_width) {
+ if (right_available && bs == 4) {
+ memcpy(above_row, above_ref, 2 * bs);
+ } else {
+ memcpy(above_row, above_ref, bs);
+ memset(above_row + bs, above_row[bs - 1], bs);
+ }
+ } else if (x0 + bs <= frame_width) {
+ const int r = frame_width - x0;
+ if (right_available && bs == 4) {
+ memcpy(above_row, above_ref, r);
+ memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
+ } else {
+ memcpy(above_row, above_ref, bs);
+ memset(above_row + bs, above_row[bs - 1], bs);
+ }
+ } else if (x0 <= frame_width) {
+ const int r = frame_width - x0;
+ memcpy(above_row, above_ref, r);
+ memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
+ }
} else {
- vpx_memcpy(above_row, above_ref, bs);
- if (bs == 4 && right_available)
- vpx_memcpy(above_row + bs, above_ref + bs, bs);
- else
- vpx_memset(above_row + bs, above_row[bs - 1], bs);
- above_row[-1] = left_available ? above_ref[-1] : 129;
+ /* faster path if the block does not need extension */
+ if (bs == 4 && right_available && left_available) {
+ const_above_row = above_ref;
+ } else {
+ memcpy(above_row, above_ref, bs);
+ if (bs == 4 && right_available)
+ memcpy(above_row + bs, above_ref + bs, bs);
+ else
+ memset(above_row + bs, above_row[bs - 1], bs);
+ }
}
+ above_row[-1] = left_available ? above_ref[-1] : 129;
+ } else {
+ memset(above_row, 127, bs * 2);
+ above_row[-1] = 127;
}
- } else {
- vpx_memset(above_row, 127, bs * 2);
- above_row[-1] = 127;
}
// predict
@@ -906,3 +942,7 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
have_top, have_left, have_right, x, y, plane);
}
+
+void vp9_init_intra_predictors() {
+ once(vp9_init_intra_predictors_internal);
+}
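Wrapping the table setup in once() makes vp9_init_intra_predictors() idempotent and safe under concurrent callers. A sketch of the idiom with hypothetical names, assuming the vpx_once.h helper included above:

/* Sketch of the once() idiom now shared by the rtcd and predictor
 * setup paths: the internal initializer runs exactly once no matter
 * how many threads race into the public entry point. */
#include "vpx_ports/vpx_once.h"

static void init_tables_internal(void) {
  /* populate function-pointer tables here */
}

void init_tables(void) {
  once(init_tables_internal);
}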
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd.c
index dc15a84ff17..2dfa09f50e0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd.c
@@ -12,9 +12,8 @@
#include "./vp9_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vpx_scale_rtcd(void);
-
void vp9_rtcd() {
- vpx_scale_rtcd();
+ // TODO(JBB): Remove this once, by ensuring that both the encoder and
+ // decoder setup functions are protected by once();
once(setup_rtcd_internal);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
index 0530f3a303e..d05afa525c3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -66,8 +66,7 @@ add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, con
specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_4x4 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_4x4_neon_asm=vp9_h_predictor_4x4_neon;
+specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_4x4/;
@@ -79,24 +78,22 @@ add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc";
-$vp9_v_predictor_4x4_neon_asm=vp9_v_predictor_4x4_neon;
+specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc";
add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_4x4 neon_asm dspr2/, "$sse_x86inc";
-$vp9_tm_predictor_4x4_neon_asm=vp9_tm_predictor_4x4_neon;
+specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc";
add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc";
add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_4x4/;
+specialize qw/vp9_dc_top_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_4x4/;
+specialize qw/vp9_dc_left_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_4x4/;
+specialize qw/vp9_dc_128_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -108,8 +105,7 @@ add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, con
specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_8x8 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_8x8_neon_asm=vp9_h_predictor_8x8_neon;
+specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_8x8/;
@@ -121,24 +117,22 @@ add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co
specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc";
add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_8x8 neon_asm/, "$sse_x86inc";
-$vp9_v_predictor_8x8_neon_asm=vp9_v_predictor_8x8_neon;
+specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc";
add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_8x8 neon_asm dspr2/, "$sse2_x86inc";
-$vp9_tm_predictor_8x8_neon_asm=vp9_tm_predictor_8x8_neon;
+specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_8x8/;
+specialize qw/vp9_dc_top_predictor_8x8/, "$sse_x86inc";
add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_8x8/;
+specialize qw/vp9_dc_left_predictor_8x8/, "$sse_x86inc";
add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_8x8/;
+specialize qw/vp9_dc_128_predictor_8x8/, "$sse_x86inc";
add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
@@ -150,8 +144,7 @@ add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, c
specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_16x16 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_16x16_neon_asm=vp9_h_predictor_16x16_neon;
+specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_16x16/;
@@ -163,24 +156,22 @@ add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride,
specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc";
add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_16x16 neon_asm/, "$sse2_x86inc";
-$vp9_v_predictor_16x16_neon_asm=vp9_v_predictor_16x16_neon;
+specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc";
add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_16x16 neon_asm/, "$sse2_x86inc";
-$vp9_tm_predictor_16x16_neon_asm=vp9_tm_predictor_16x16_neon;
+specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc";
add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc";
add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_16x16/;
+specialize qw/vp9_dc_top_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_16x16/;
+specialize qw/vp9_dc_left_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_16x16/;
+specialize qw/vp9_dc_128_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc";
@@ -192,8 +183,7 @@ add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, c
specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_32x32 neon_asm/, "$ssse3_x86inc";
-$vp9_h_predictor_32x32_neon_asm=vp9_h_predictor_32x32_neon;
+specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_32x32/;
@@ -205,24 +195,22 @@ add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride,
specialize qw/vp9_d153_predictor_32x32/;
add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_32x32 neon_asm/, "$sse2_x86inc";
-$vp9_v_predictor_32x32_neon_asm=vp9_v_predictor_32x32_neon;
+specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc";
add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_32x32 neon_asm/, "$sse2_x86_64";
-$vp9_tm_predictor_32x32_neon_asm=vp9_tm_predictor_32x32_neon;
+specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64";
add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc";
add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_32x32/;
+specialize qw/vp9_dc_top_predictor_32x32/, "$sse2_x86inc";
add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_32x32/;
+specialize qw/vp9_dc_left_predictor_32x32/, "$sse2_x86inc";
add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_32x32/;
+specialize qw/vp9_dc_128_predictor_32x32/, "$sse2_x86inc";
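For context on the pattern running through this hunk: `add_proto` declares the C signature of a codec entry point and `specialize` lists the per-ISA variants the generated vp9_rtcd.h may dispatch to. The rename from `neon_asm` to `neon` means the NEON builds now register under the canonical `_neon` suffix directly, so the explicit `$..._neon_asm=..._neon` alias lines become unnecessary. A minimal sketch of the dispatch idea, not the generated header, with illustrative names:

    /* Sketch of RTCD-style dispatch: one C reference implementation plus a
     * function pointer that init code would repoint at the best available
     * ISA variant (e.g. _sse or _neon). */
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
      (void)left;  /* vertical prediction only uses the row above */
      for (int r = 0; r < 4; ++r)
        memcpy(dst + r * y_stride, above, 4);
    }

    /* One pointer per prototype; CPU-feature probing selects the target. */
    static void (*vp9_v_predictor_4x4)(uint8_t *, ptrdiff_t, const uint8_t *,
                                       const uint8_t *) = vp9_v_predictor_4x4_c;

    int main(void) {
      const uint8_t above[4] = {1, 2, 3, 4}, left[4] = {0};
      uint8_t dst[16];
      vp9_v_predictor_4x4(dst, 4, above, left);
      return dst[15] == 4 ? 0 : 1;  /* every output row repeats `above` */
    }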
#
# Loopfilter
@@ -244,12 +232,10 @@ specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/;
$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon;
add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_4 mmx neon_asm dspr2/;
-$vp9_lpf_vertical_4_neon_asm=vp9_lpf_vertical_4_neon;
+specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/;
add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_4_dual sse2 neon_asm dspr2/;
-$vp9_lpf_vertical_4_dual_neon_asm=vp9_lpf_vertical_4_dual_neon;
+specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/;
add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/;
@@ -264,12 +250,10 @@ specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/;
$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon;
add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_4 mmx neon_asm dspr2/;
-$vp9_lpf_horizontal_4_neon_asm=vp9_lpf_horizontal_4_neon;
+specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/;
add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/;
-$vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon;
+specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/;
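In these loop-filter prototypes, `blimit`, `limit` and `thresh` are per-edge strength thresholds: pixels are only smoothed where the step across the block edge and the local gradients on each side are small enough. A hedged sketch of that gating test for the 4-tap case (the real mask additionally uses `thresh` for a high-edge-variance check):

    #include <stdint.h>
    #include <stdlib.h>

    /* p1,p0 sit on one side of the edge, q0,q1 on the other. */
    static int filter_mask4(uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                            int blimit, int limit) {
      return abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit &&  /* edge step */
             abs(p1 - p0) <= limit && abs(q1 - q0) <= limit;   /* flat sides */
    }

    int main(void) {
      /* Step of 10 with matching p1/q1: 2*10 + 10/2 = 25 <= blimit, so the
       * edge qualifies for filtering. */
      return filter_mask4(100, 100, 110, 110, 25, 3) ? 0 : 1;
    }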
#
# post proc
@@ -290,42 +274,40 @@ $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
specialize qw/vp9_plane_add_noise sse2/;
$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
+
+add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight16x16 sse2/;
+
+add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight8x8 sse2/;
}
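The two `vp9_filter_by_weight*` hooks added above feed the MFQE (multi-frame quality enhancement) post-processing path, which blends each block with its co-located block from a previous good frame. A sketch of the blend, assuming a 4-bit fixed-point weight scheme (source and destination weights summing to 16, round-to-nearest):

    #include <stdint.h>

    enum { PREC = 4, HALF = 1 << (PREC - 1) };

    /* Weighted average of src into dst; src_weight assumed in [0, 16]. */
    static void filter_by_weight(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride,
                                 int size, int src_weight) {
      const int dst_weight = (1 << PREC) - src_weight;
      for (int r = 0; r < size; ++r) {
        for (int c = 0; c < size; ++c)
          dst[c] = (uint8_t)((src[c] * src_weight + dst[c] * dst_weight + HALF)
                             >> PREC);
        src += src_stride;
        dst += dst_stride;
      }
    }

    int main(void) {
      uint8_t s[4] = {16, 16, 16, 16}, d[4] = {0, 0, 0, 0};
      filter_by_weight(s, 2, d, 2, 2, 8);  /* equal weights: plain average */
      return d[0] == 8 ? 0 : 1;
    }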
#
# Sub Pixel Filters
#
add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_copy neon_asm dspr2/, "$sse2_x86inc";
-$vp9_convolve_copy_neon_asm=vp9_convolve_copy_neon;
+specialize qw/vp9_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
-$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
+specialize qw/vp9_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
-$vp9_convolve8_neon_asm=vp9_convolve8_neon;
+specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
-$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
-$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_neon_asm=vp9_convolve8_avg_neon;
+specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/;
add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_horiz_neon_asm=vp9_convolve8_avg_horiz_neon;
+specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2/;
add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon;
+specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2/;
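`msa`, newly appearing in several specialization lists here, is the MIPS SIMD Architecture backend. Of the convolve family, `vp9_convolve_copy` is the degenerate case worth spelling out: its filter and step arguments exist only so it can share the common prototype, and the behaviour is a plain block copy. A sketch with the unused filter arguments omitted:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Block copy; the real prototype also carries filter_x/filter_y and
     * x_step_q4/y_step_q4, which this path ignores. */
    static void convolve_copy(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              int w, int h) {
      for (int y = 0; y < h; ++y) {
        memcpy(dst, src, (size_t)w);
        src += src_stride;
        dst += dst_stride;
      }
    }

    int main(void) {
      uint8_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8}, dst[8] = {0};
      convolve_copy(src, 4, dst, 4, 4, 2);  /* copy a 4x2 block */
      return dst[7] == 8 ? 0 : 1;
    }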
#
# dct
@@ -437,59 +419,48 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_iwht4x4_16_add/;
} else {
add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
- $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+ specialize qw/vp9_idct4x4_1_add sse2 neon dspr2/;
add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
- $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+ specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
- $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+ specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
- $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+ specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
- $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+ specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
- $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+ specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
- $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+ specialize qw/vp9_idct16x16_256_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
- $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+ specialize qw/vp9_idct16x16_10_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
- $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+ specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
+ specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2 msa/;
+ # NOTE: the alias below maps the 34-coefficient NEON hook onto the 1024-coefficient kernel; flagged upstream as a possible typo.
$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
- $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+ specialize qw/vp9_idct32x32_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
- $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+ specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
- $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+ specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+ specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
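The numeric suffixes on these inverse transforms (`_1`, `_12`, `_34`, `_256`, `_1024`, ...) encode an upper bound on how many coefficients can be nonzero, so the decoder can pick a cheaper kernel from the end-of-block (eob) position. A sketch of that selection for the 32x32 case, with stub kernels standing in for the real ones:

    #include <stdint.h>

    typedef int16_t tran_low_t;  /* assumption: non-high-bitdepth build */

    /* Stubs standing in for the real transform kernels. */
    static void idct32x32_1_add(const tran_low_t *in, uint8_t *d, int s) {
      (void)in; (void)d; (void)s;
    }
    static void idct32x32_34_add(const tran_low_t *in, uint8_t *d, int s) {
      (void)in; (void)d; (void)s;
    }
    static void idct32x32_1024_add(const tran_low_t *in, uint8_t *d, int s) {
      (void)in; (void)d; (void)s;
    }

    static void idct32x32_add(const tran_low_t *input, uint8_t *dest,
                              int stride, int eob) {
      if (eob == 1)
        idct32x32_1_add(input, dest, stride);     /* DC-only block */
      else if (eob <= 34)
        idct32x32_34_add(input, dest, stride);    /* low-frequency corner */
      else
        idct32x32_1024_add(input, dest, stride);  /* full transform */
    }

    int main(void) {
      tran_low_t coeff[1024] = {7};   /* DC coefficient only */
      uint8_t dest[32 * 32] = {0};
      idct32x32_add(coeff, dest, 32, 1);
      return 0;
    }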
# dct and add
@@ -528,7 +499,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_d153_predictor_4x4/;
add_proto qw/void vp9_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vp9_highbd_v_predictor_4x4 neon/, "$sse_x86inc";
+ specialize qw/vp9_highbd_v_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_tm_predictor_4x4/, "$sse_x86inc";
@@ -606,7 +577,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_d153_predictor_16x16/;
add_proto qw/void vp9_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vp9_highbd_v_predictor_16x16 neon/, "$sse2_x86inc";
+ specialize qw/vp9_highbd_v_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_tm_predictor_16x16/, "$sse2_x86_64";
@@ -750,27 +721,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_1_add/;
- add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vp9_highbd_idct4x4_16_add/;
-
add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_1_add/;
- add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vp9_highbd_idct8x8_64_add/;
-
- add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vp9_highbd_idct8x8_10_add/;
-
add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_1_add/;
- add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vp9_highbd_idct16x16_256_add/;
-
- add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vp9_highbd_idct16x16_10_add/;
-
add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_1024_add/;
@@ -796,6 +752,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_iwht4x4_16_add/;
+
+ # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+
+ add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct4x4_16_add/;
+
+ add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct8x8_64_add/;
+
+ add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct8x8_10_add/;
+
+ add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct16x16_256_add/;
+
+ add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct16x16_10_add/;
+
+ } else {
+
+ add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct4x4_16_add sse2/;
+
+ add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct8x8_64_add sse2/;
+
+ add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct8x8_10_add sse2/;
+
+ add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct16x16_256_add sse2/;
+
+ add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct16x16_10_add sse2/;
+ }
}
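The new branch above makes `CONFIG_EMULATE_HARDWARE=1` re-declare the high-bitdepth inverse transforms with no SIMD specializations, so hardware-verification builds exercise the bit-exact C reference paths, while normal builds pick up the newly added sse2 versions. The same idea reduced to a compile-time switch, where `EMULATE_HARDWARE` is a stand-in macro:

    #include <stdio.h>

    static void idct_c(void)    { puts("reference C path"); }
    static void idct_sse2(void) { puts("sse2 path"); }

    /* Build with -DEMULATE_HARDWARE to force the reference path. */
    #ifdef EMULATE_HARDWARE
    #define idct idct_c
    #else
    #define idct idct_sse2
    #endif

    int main(void) {
      idct();
      return 0;
    }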
#
@@ -812,16 +804,16 @@ add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int sourc
specialize qw/vp9_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x64/, "$sse2_x86inc";
+specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
@@ -851,7 +843,7 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_
specialize qw/vp9_variance4x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -930,172 +922,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
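NEON coverage is extended above to the 64x32, 32x64 and 64x64 variance kernels and to the 64x64 sub-pixel variant. All variance kernels share one definition: accumulate the sum and the sum of squares of the source/reference differences, then return sse minus sum squared over the pixel count. A sketch for an arbitrary block:

    #include <stdint.h>

    /* Returns the block variance and writes the raw SSE through *sse. */
    static unsigned variance(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             int w, int h, unsigned *sse) {
      int64_t sum = 0;
      uint64_t sq = 0;
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          const int d = src[x] - ref[x];
          sum += d;
          sq += (uint64_t)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = (unsigned)sq;
      return (unsigned)(sq - (uint64_t)((sum * sum) / (w * h)));
    }

    int main(void) {
      const uint8_t a[4] = {10, 10, 10, 10}, b[4] = {8, 12, 8, 12};
      unsigned sse;
      return variance(a, 2, b, 2, 2, 2, &sse) == 16 ? 0 : 1;
    }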
-add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad64x64 neon avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x64 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad64x32 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x32 neon avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x16 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x8 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x4/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad4x8/, "$sse_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad4x4/, "$sse_x86inc";
-
-add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad64x64_avg avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x64_avg avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad64x32_avg avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x16_avg avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x32_avg avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad4x8_avg/, "$sse_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad4x4_avg/, "$sse_x86inc";
-
-add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x3/;
-
-add_proto qw/void vp9_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x3/;
-
-add_proto qw/void vp9_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x16x3 sse3 ssse3/;
-
-add_proto qw/void vp9_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x8x3 sse3 ssse3/;
-
-add_proto qw/void vp9_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x16x3 sse3/;
-
-add_proto qw/void vp9_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x8x3 sse3/;
-
-add_proto qw/void vp9_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad4x4x3 sse3/;
-
-add_proto qw/void vp9_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad64x64x8/;
-
-add_proto qw/void vp9_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad32x32x8/;
-
-add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad16x16x8 sse4/;
-
-add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad16x8x8 sse4/;
-
-add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x16x8 sse4/;
-
-add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x8x8 sse4/;
-
-add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x4x8/;
-
-add_proto qw/void vp9_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad4x8x8/;
-
-add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad4x4x8 sse4/;
-
-add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2 avx2/;
-
-add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x64x4d sse2/;
-
-add_proto qw/void vp9_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x32x4d sse2/;
-
-add_proto qw/void vp9_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x16x4d sse2/;
-
-add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x32x4d sse2/;
-
-add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2 avx2/;
-
-add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x16x4d sse2/;
-
-add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x8x4d sse2/;
-
-add_proto qw/void vp9_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x16x4d sse2/;
-
-add_proto qw/void vp9_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x8x4d sse2/;
-
-# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
-add_proto qw/void vp9_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x4x4d sse2/;
-
-add_proto qw/void vp9_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad4x8x4d sse/;
-
-add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad4x4x4d sse/;
-
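The entire SAD block above, including the `_avg` second-predictor forms, the x3/x8 multi-offset forms, and the x4d four-reference forms, is removed from the VP9-specific RTCD list in this update. The kernel itself is the plain sum of absolute differences over the block:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences over a w x h block. The removed _avg
     * variants first average a second predictor into ref; the x4d forms
     * evaluate four candidate ref pointers per call. */
    static unsigned sad(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride, int w, int h) {
      unsigned acc = 0;
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) acc += (unsigned)abs(src[x] - ref[x]);
        src += src_stride;
        ref += ref_stride;
      }
      return acc;
    }

    int main(void) {
      const uint8_t a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1};
      return sad(a, 2, b, 2, 2, 2) == 8 ? 0 : 1;  /* 3 + 1 + 1 + 3 */
    }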
add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
@@ -1112,11 +938,39 @@ add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp9_avg_8x8 sse2/;
+specialize qw/vp9_avg_8x8 sse2 neon/;
+
+add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
+specialize qw/vp9_avg_4x4 sse2/;
+
+add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+specialize qw/vp9_minmax_8x8 sse2/;
+
+add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64";
+
+add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_16x16 sse2/;
+
+add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
+specialize qw/vp9_satd sse2/;
+
+add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
+specialize qw/vp9_int_pro_row sse2/;
+
+add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
+specialize qw/vp9_int_pro_col sse2/;
+
+add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
+specialize qw/vp9_vector_var sse2/;
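The block of new prototypes above (`vp9_avg_*`, `vp9_minmax_8x8`, `vp9_hadamard_*`, `vp9_satd`, `vp9_int_pro_*`, `vp9_vector_var`) supplies cheap block statistics for the speed-oriented encoder paths, where they drive partitioning decisions and coarse motion search. The simplest of them, sketched under the assumption of round-to-nearest averaging (64 pixels, so add 32 before shifting by 6):

    #include <stdint.h>

    /* Mean of an 8x8 block with row stride p, rounded to nearest. */
    static unsigned avg_8x8(const uint8_t *s, int p) {
      unsigned sum = 0;
      for (int r = 0; r < 8; ++r, s += p)
        for (int c = 0; c < 8; ++c) sum += s[c];
      return (sum + 32) >> 6;
    }

    int main(void) {
      uint8_t block[64];
      for (int i = 0; i < 64; ++i) block[i] = (uint8_t)i;  /* mean 31.5 */
      return avg_8x8(block, 8) == 32 ? 0 : 1;              /* rounds up */
    }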
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_highbd_avg_8x8/;
+ add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
+ specialize qw/vp9_highbd_avg_4x4/;
+ add_proto qw/unsigned int vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/vp9_highbd_minmax_8x8/;
}
# ENCODEMB INVOKE
@@ -1138,32 +992,41 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error/;
- add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp/;
- add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/;
- add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b/;
- add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_32x32/;
+
+ add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_fdct8x8_quant/;
} else {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
- add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
+ add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+ specialize qw/vp9_block_error_fp sse2/;
- add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
}
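Two things change throughout this hunk: the unused `zbin_oq_value` argument is dropped from every quantizer prototype, and a fused `vp9_fdct8x8_quant` (forward transform plus quantize in one pass) is introduced with sse2/ssse3/neon versions. A hedged scalar sketch of the "fp" quantizer core, assuming Q16 fixed-point quantizer steps and ignoring `skip_block` and the scan tables:

    #include <stdint.h>
    #include <stdlib.h>

    typedef int16_t tran_low_t;  /* assumption: non-high-bitdepth build */

    /* round/quant/dequant hold two entries each: [0] for DC, [1] for AC. */
    static void quantize_fp(const tran_low_t *coeff, intptr_t n_coeffs,
                            const int16_t *round, const int16_t *quant,
                            const int16_t *dequant, tran_low_t *qcoeff,
                            tran_low_t *dqcoeff, uint16_t *eob) {
      int last_nz = -1;
      for (intptr_t i = 0; i < n_coeffs; ++i) {
        const int k = (i == 0) ? 0 : 1;     /* DC vs AC table entry */
        const int neg = coeff[i] < 0;
        const int abs_c = abs(coeff[i]);
        int q = (int)(((abs_c + round[k]) * (int64_t)quant[k]) >> 16);
        if (neg) q = -q;
        qcoeff[i] = (tran_low_t)q;
        dqcoeff[i] = (tran_low_t)(q * dequant[k]);
        if (q) last_nz = (int)i;
      }
      *eob = (uint16_t)(last_nz + 1);       /* one past the last nonzero */
    }

    int main(void) {
      const tran_low_t c[4] = {100, -40, 0, 3};
      const int16_t round[2] = {8, 8}, quant[2] = {1 << 14, 1 << 14},
                    dequant[2] = {4, 4};
      tran_low_t q[4], dq[4];
      uint16_t eob;
      quantize_fp(c, 4, round, quant, dequant, q, dq, &eob);
      return (q[0] == 27 && eob == 4) ? 0 : 1;
    }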
#
@@ -1181,43 +1044,43 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht4x4/;
+ specialize qw/vp9_fht4x4 sse2/;
add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht8x8/;
+ specialize qw/vp9_fht8x8 sse2/;
add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht16x16/;
+ specialize qw/vp9_fht16x16 sse2/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fwht4x4/;
+ specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct4x4_1/;
+ specialize qw/vp9_fdct4x4_1 sse2/;
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct4x4/;
+ specialize qw/vp9_fdct4x4 sse2/;
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct8x8_1/;
+ specialize qw/vp9_fdct8x8_1 sse2/;
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct8x8/;
+ specialize qw/vp9_fdct8x8 sse2/;
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct16x16_1/;
+ specialize qw/vp9_fdct16x16_1 sse2/;
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct16x16/;
+ specialize qw/vp9_fdct16x16 sse2/;
add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32_1/;
+ specialize qw/vp9_fdct32x32_1 sse2/;
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32/;
+ specialize qw/vp9_fdct32x32 sse2/;
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32_rd/;
+ specialize qw/vp9_fdct32x32_rd sse2/;
} else {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4 sse2/;
@@ -1267,9 +1130,6 @@ specialize qw/vp9_full_search_sad sse3 sse4_1/;
$vp9_full_search_sad_sse3=vp9_full_search_sadx3;
$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
-add_proto qw/int vp9_refining_search_sad/, "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_refining_search_sad/;
-
add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
specialize qw/vp9_diamond_search_sad/;
@@ -1283,34 +1143,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# variance
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x16/;
+ specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x32/;
+ specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance64x32/;
+ specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x64/;
+ specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x32/;
+ specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance64x64/;
+ specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x16/;
+ specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x8/;
+ specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x16/;
+ specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x8/;
+ specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x4/;
@@ -1322,40 +1182,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_variance4x4/;
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_get8x8var/;
+ specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_get16x16var/;
+ specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x16/;
+ specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x32/;
+ specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance64x32/;
+ specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x64/;
+ specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x32/;
+ specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance64x64/;
+ specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x16/;
+ specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x8/;
+ specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x16/;
+ specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x8/;
+ specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x4/;
@@ -1367,40 +1227,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_variance4x4/;
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_10_get8x8var/;
+ specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_10_get16x16var/;
+ specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x16/;
+ specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x32/;
+ specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance64x32/;
+ specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x64/;
+ specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x32/;
+ specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance64x64/;
+ specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x16/;
+ specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x8/;
+ specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x16/;
+ specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x8/;
+ specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x4/;
@@ -1412,76 +1272,76 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_variance4x4/;
add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_12_get8x8var/;
+ specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_12_get16x16var/;
+ specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance64x64/;
+ specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x64/;
+ specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance64x32/;
+ specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x16/;
+ specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x32/;
+ specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x32/;
+ specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x16/;
+ specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x16/;
+ specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x8/;
+ specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x8/;
+ specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x4/;
+ specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x8/;
@@ -1496,70 +1356,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
@@ -1574,70 +1434,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
@@ -1651,174 +1511,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
- add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad64x64/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad32x64/;
-
- add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad64x32/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad32x16/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad16x32/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad32x32/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad16x16/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad16x8/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad8x16/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad8x8/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad8x4/;
-
- add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad4x8/;
-
- add_proto qw/unsigned int vp9_highbd_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad4x4/;
-
- add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad64x64_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad32x64_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad64x32_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad32x16_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad16x32_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad32x32_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad16x16_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad16x8_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad8x16_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad8x8_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad8x4_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad4x8_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad4x4_avg/;
-
- add_proto qw/void vp9_highbd_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad64x64x3/;
-
- add_proto qw/void vp9_highbd_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x32x3/;
-
- add_proto qw/void vp9_highbd_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x16x3/;
-
- add_proto qw/void vp9_highbd_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x8x3/;
-
- add_proto qw/void vp9_highbd_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x16x3/;
-
- add_proto qw/void vp9_highbd_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x8x3/;
-
- add_proto qw/void vp9_highbd_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad4x4x3/;
-
- add_proto qw/void vp9_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad64x64x8/;
-
- add_proto qw/void vp9_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad32x32x8/;
-
- add_proto qw/void vp9_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad16x16x8/;
-
- add_proto qw/void vp9_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad16x8x8/;
-
- add_proto qw/void vp9_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad8x16x8/;
-
- add_proto qw/void vp9_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad8x8x8/;
-
- add_proto qw/void vp9_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad8x4x8/;
-
- add_proto qw/void vp9_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad4x8x8/;
-
- add_proto qw/void vp9_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad4x4x8/;
-
- add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad64x64x4d/;
-
- add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x64x4d/;
-
- add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad64x32x4d/;
-
- add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x16x4d/;
-
- add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x32x4d/;
-
- add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x32x4d/;
-
- add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x16x4d/;
-
- add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x8x4d/;
-
- add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x16x4d/;
-
- add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x8x4d/;
-
- # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
- add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x4x4d/;
-
- add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad4x8x4d/;
-
- add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad4x4x4d/;
-
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse16x16/;
+ specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x16/;
@@ -1827,10 +1521,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_mse16x8/;
add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse8x8/;
+ specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse16x16/;
+ specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x16/;
@@ -1839,10 +1533,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_mse16x8/;
add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse8x8/;
+ specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse16x16/;
+ specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x16/;
@@ -1851,27 +1545,27 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_mse16x8/;
add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse8x8/;
+ specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
# ENCODEMB INVOKE
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
- specialize qw/vp9_highbd_block_error/;
+ specialize qw/vp9_highbd_block_error sse2/;
add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
specialize qw/vp9_highbd_subtract_block/;
- add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_fp/;
- add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_fp_32x32/;
- add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_highbd_quantize_b/;
+ add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_highbd_quantize_b sse2/;
- add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_highbd_quantize_b_32x32/;
+ add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
#
# Structured Similarity (SSIM)
@@ -1883,40 +1577,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# fdct functions
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_highbd_fht4x4/;
+ specialize qw/vp9_highbd_fht4x4 sse2/;
add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_highbd_fht8x8/;
+ specialize qw/vp9_highbd_fht8x8 sse2/;
add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_highbd_fht16x16/;
+ specialize qw/vp9_highbd_fht16x16 sse2/;
add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fwht4x4/;
add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct4x4/;
+ specialize qw/vp9_highbd_fdct4x4 sse2/;
add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct8x8_1/;
add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct8x8/;
+ specialize qw/vp9_highbd_fdct8x8 sse2/;
add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct16x16_1/;
add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct16x16/;
+ specialize qw/vp9_highbd_fdct16x16 sse2/;
add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32_1/;
add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct32x32/;
+ specialize qw/vp9_highbd_fdct32x32 sse2/;
add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct32x32_rd/;
+ specialize qw/vp9_highbd_fdct32x32_rd sse2/;
add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_highbd_temporal_filter_apply/;
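[Editor's note: the specializations above feed the run-time dispatch in the generated header. A sketch of the assumed setup_rtcd_internal() shape, with x86_simd_caps() supplying CPUID-derived capability bits:

    static void setup_rtcd_internal(void) {
      int flags = x86_simd_caps();

      /* Default every pointer to the C implementation... */
      vp9_highbd_fdct8x8 = vp9_highbd_fdct8x8_c;
      /* ...then upgrade it when the CPU advertises SSE2. */
      if (flags & HAS_SSE2) vp9_highbd_fdct8x8 = vp9_highbd_fdct8x8_sse2;
    }

This is why the .pl changes alone are enough to enable the new SSE2 paths: no call site changes, only the pointer targets.]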
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h
index e9711582303..161c381ad08 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h
@@ -17,8 +17,10 @@
# include <intrin.h>
# define USE_MSC_INTRIN
# endif
+#if _MSC_VER < 1900
# define snprintf _snprintf
#endif
+#endif
#ifdef __cplusplus
extern "C" {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread.h
index 864579c03c3..12848fedeff 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread.h
@@ -22,9 +22,13 @@
extern "C" {
#endif
+// Cap the number of decode threads at 8, given the limit on frame buffers
+// and the shortage of semaphores in the emulation layer on Windows.
+#define MAX_DECODE_THREADS 8
+
#if CONFIG_MULTITHREAD
-#if defined(_WIN32)
+#if defined(_WIN32) && !HAVE_PTHREAD_H
#include <errno.h> // NOLINT
#include <process.h> // NOLINT
#include <windows.h> // NOLINT
@@ -103,8 +107,8 @@ static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
static INLINE int pthread_cond_init(pthread_cond_t *const condition,
void* cond_attr) {
(void)cond_attr;
- condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
- condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+ condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+ condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
if (condition->waiting_sem_ == NULL ||
condition->received_sem_ == NULL ||
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.c
new file mode 100644
index 00000000000..cba57ff41aa
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_loopfilter.h"
+
+#if CONFIG_MULTITHREAD
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+ const int kMaxTryLocks = 4000;
+ int locked = 0;
+ int i;
+
+ for (i = 0; i < kMaxTryLocks; ++i) {
+ if (!pthread_mutex_trylock(mutex)) {
+ locked = 1;
+ break;
+ }
+ }
+
+ if (!locked)
+ pthread_mutex_lock(mutex);
+}
+#endif // CONFIG_MULTITHREAD
+
+static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
+ mutex_lock(mutex);
+
+ while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
+ pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
+ const int sb_cols) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+ int cur;
+ // Only signal when enough superblocks have been filtered for the next row to run.
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync)
+ sig = 0;
+ } else {
+ cur = sb_cols + nsync;
+ }
+
+ if (sig) {
+ mutex_lock(&lf_sync->mutex_[r]);
+
+ lf_sync->cur_sb_col[r] = cur;
+
+ pthread_cond_signal(&lf_sync->cond_[r]);
+ pthread_mutex_unlock(&lf_sync->mutex_[r]);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+#endif // CONFIG_MULTITHREAD
+}
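[Editor's note: a standalone model of the invariant the two helpers above maintain (hypothetical names; nsync comes from get_sync_range() later in this file):

    #include <assert.h>

    /* A superblock at (row, col) may be filtered once the row above has
     * progressed at least nsync columns past it; sync_write() publishes
     * cur_sb_col and sync_read() blocks until this predicate holds. */
    static int may_filter(int col, int cur_sb_col_above, int nsync) {
      return cur_sb_col_above - nsync >= col;
    }

    int main(void) {
      assert(!may_filter(0, 3, 4));  /* row above only at column 3: wait */
      assert(may_filter(0, 4, 4));   /* column 4 reached: proceed */
      return 0;
    }
]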
+
+// Implement row loopfiltering for each thread.
+static INLINE
+void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
+ VP9_COMMON *const cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int start, int stop, int y_only,
+ VP9LfSync *const lf_sync) {
+ const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+ const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+ int mi_row, mi_col;
+ enum lf_path path;
+ if (y_only)
+ path = LF_PATH_444;
+ else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+ path = LF_PATH_420;
+ else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+ path = LF_PATH_444;
+ else
+ path = LF_PATH_SLOW;
+
+ for (mi_row = start; mi_row < stop;
+ mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
+ MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+ const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
+ const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+ LOOP_FILTER_MASK lfm;
+ int plane;
+
+ sync_read(lf_sync, r, c);
+
+ vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+ // TODO(JBB): Make setup_mask work for non-420 subsampling.
+ vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+ &lfm);
+
+ vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+ for (plane = 1; plane < num_planes; ++plane) {
+ switch (path) {
+ case LF_PATH_420:
+ vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_444:
+ vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_SLOW:
+ vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ break;
+ }
+ }
+
+ sync_write(lf_sync, r, c, sb_cols);
+ }
+ }
+}
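[Editor's note: work distribution in thread_loop_filter_rows() is a simple stride: worker i starts at SB row i and advances num_workers SB rows at a time. Illustratively (hypothetical helper, not in the source):

    /* With num_workers == 3, worker 0 filters SB rows 0, 3, 6, ...,
     * worker 1 rows 1, 4, 7, ..., and worker 2 rows 2, 5, 8, ... */
    static int worker_for_sb_row(int sb_row, int num_workers) {
      return sb_row % num_workers;
    }
]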
+
+// Row-based multi-threaded loopfilter hook
+static int loop_filter_row_worker(VP9LfSync *const lf_sync,
+ LFWorkerData *const lf_data) {
+ thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+ lf_data->start, lf_data->stop, lf_data->y_only,
+ lf_sync);
+ return 1;
+}
+
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
+ VP9_COMMON *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int start, int stop, int y_only,
+ VP9Worker *workers, int nworkers,
+ VP9LfSync *lf_sync) {
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ // Number of superblock rows.
+ const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+ // The decoder may allocate more threads than the number of tiles, based
+ // on the user's input.
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int num_workers = MIN(nworkers, tile_cols);
+ int i;
+
+ if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+ num_workers > lf_sync->num_workers) {
+ vp9_loop_filter_dealloc(lf_sync);
+ vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+ }
+
+ // Initialize cur_sb_col to -1 for all SB rows.
+ memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+ // Set up loopfilter thread data.
+ // The decoder caps num_workers because using more loopfilter threads than
+ // cores has been observed to hurt performance on Android: the system
+ // schedules the tile decode workers on only as many cores as there are
+ // tile columns, so extra loopfilter threads merely contend with them.
+ // If the multithreading code changes in the future, the number of workers
+ // used by the loopfilter should be revisited.
+ for (i = 0; i < num_workers; ++i) {
+ VP9Worker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = (VP9WorkerHook)loop_filter_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
+ lf_data->start = start + i * MI_BLOCK_SIZE;
+ lf_data->stop = stop;
+ lf_data->y_only = y_only;
+
+ // Start loopfiltering
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait until all rows are finished.
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+}
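[Editor's note: the launch/execute split above follows the usual VP9Worker idiom: the first num_workers - 1 jobs run on their own threads while the caller's thread executes the last job itself, so no core idles during the filter pass. Condensed restatement of the dispatch loop:

    for (i = 0; i < num_workers; ++i) {
      if (i == num_workers - 1)
        winterface->execute(&workers[i]);  /* run on the calling thread */
      else
        winterface->launch(&workers[i]);   /* run asynchronously */
    }
    for (i = 0; i < num_workers; ++i)
      winterface->sync(&workers[i]);       /* join / collect status */
]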
+
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+ VP9_COMMON *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int frame_filter_level,
+ int y_only, int partial_frame,
+ VP9Worker *workers, int num_workers,
+ VP9LfSync *lf_sync) {
+ int start_mi_row, end_mi_row, mi_rows_to_filter;
+
+ if (!frame_filter_level) return;
+
+ start_mi_row = 0;
+ mi_rows_to_filter = cm->mi_rows;
+ if (partial_frame && cm->mi_rows > 8) {
+ start_mi_row = cm->mi_rows >> 1;
+ start_mi_row &= 0xfffffff8;
+ mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
+ }
+ end_mi_row = start_mi_row + mi_rows_to_filter;
+ vp9_loop_filter_frame_init(cm, frame_filter_level);
+
+ loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,
+ y_only, workers, num_workers, lf_sync);
+}
+
+// Set up nsync by width.
+static INLINE int get_sync_range(int width) {
+ // nsync numbers are picked by testing. For example, for 4k
+ // video, using 4 gives the best performance.
+ if (width < 640)
+ return 1;
+ else if (width <= 1280)
+ return 2;
+ else if (width <= 4096)
+ return 4;
+ else
+ return 8;
+}
+
+// Allocate memory for lf row synchronization
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
+ int width, int num_workers) {
+ lf_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+ {
+ int i;
+
+ CHECK_MEM_ERROR(cm, lf_sync->mutex_,
+ vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
+ if (lf_sync->mutex_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lf_sync->cond_,
+ vpx_malloc(sizeof(*lf_sync->cond_) * rows));
+ if (lf_sync->cond_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->cond_[i], NULL);
+ }
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+ vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
+ lf_sync->num_workers = num_workers;
+
+ CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+ vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+
+ // Set up nsync.
+ lf_sync->sync_range = get_sync_range(width);
+}
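[Editor's note: CHECK_MEM_ERROR is assumed here to be libvpx's usual allocation guard, which assigns and then bails out through the codec's error handler on failure rather than returning a status. A sketch, not the verbatim macro:

    #define CHECK_MEM_ERROR(cm, lval, expr)                      \
      do {                                                       \
        (lval) = (expr);                                         \
        if (!(lval))                                             \
          vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR,  \
                             "Failed to allocate " #lval);       \
      } while (0)
]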
+
+// Deallocate the loopfilter synchronization mutexes and data.
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
+ if (lf_sync != NULL) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ if (lf_sync->mutex_ != NULL) {
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_mutex_destroy(&lf_sync->mutex_[i]);
+ }
+ vpx_free(lf_sync->mutex_);
+ }
+ if (lf_sync->cond_ != NULL) {
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_cond_destroy(&lf_sync->cond_[i]);
+ }
+ vpx_free(lf_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+ vpx_free(lf_sync->lfdata);
+ vpx_free(lf_sync->cur_sb_col);
+ // Clear the structure: the caller may be handling a resize, in which case
+ // this call will be followed by an _alloc() that may itself fail.
+ vp9_zero(*lf_sync);
+ }
+}
+
+// Accumulate frame counts.
+void vp9_accumulate_frame_counts(VP9_COMMON *cm, FRAME_COUNTS *counts,
+ int is_dec) {
+ int i, j, k, l, m;
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+ for (j = 0; j < INTRA_MODES; j++)
+ cm->counts.y_mode[i][j] += counts->y_mode[i][j];
+
+ for (i = 0; i < INTRA_MODES; i++)
+ for (j = 0; j < INTRA_MODES; j++)
+ cm->counts.uv_mode[i][j] += counts->uv_mode[i][j];
+
+ for (i = 0; i < PARTITION_CONTEXTS; i++)
+ for (j = 0; j < PARTITION_TYPES; j++)
+ cm->counts.partition[i][j] += counts->partition[i][j];
+
+ if (is_dec) {
+ int n;
+ for (i = 0; i < TX_SIZES; i++)
+ for (j = 0; j < PLANE_TYPES; j++)
+ for (k = 0; k < REF_TYPES; k++)
+ for (l = 0; l < COEF_BANDS; l++)
+ for (m = 0; m < COEFF_CONTEXTS; m++) {
+ cm->counts.eob_branch[i][j][k][l][m] +=
+ counts->eob_branch[i][j][k][l][m];
+ for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+ cm->counts.coef[i][j][k][l][m][n] +=
+ counts->coef[i][j][k][l][m][n];
+ }
+ } else {
+ for (i = 0; i < TX_SIZES; i++)
+ for (j = 0; j < PLANE_TYPES; j++)
+ for (k = 0; k < REF_TYPES; k++)
+ for (l = 0; l < COEF_BANDS; l++)
+ for (m = 0; m < COEFF_CONTEXTS; m++)
+ cm->counts.eob_branch[i][j][k][l][m] +=
+ counts->eob_branch[i][j][k][l][m];
+ // In the encoder, cm->counts.coef is only updated at the frame
+ // level, so there is no need to accumulate it here.
+ // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+ // cm->counts.coef[i][j][k][l][m][n] +=
+ // counts->coef[i][j][k][l][m][n];
+ }
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ for (j = 0; j < SWITCHABLE_FILTERS; j++)
+ cm->counts.switchable_interp[i][j] += counts->switchable_interp[i][j];
+
+ for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+ for (j = 0; j < INTER_MODES; j++)
+ cm->counts.inter_mode[i][j] += counts->inter_mode[i][j];
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ for (j = 0; j < 2; j++)
+ cm->counts.intra_inter[i][j] += counts->intra_inter[i][j];
+
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+ for (j = 0; j < 2; j++)
+ cm->counts.comp_inter[i][j] += counts->comp_inter[i][j];
+
+ for (i = 0; i < REF_CONTEXTS; i++)
+ for (j = 0; j < 2; j++)
+ for (k = 0; k < 2; k++)
+ cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k];
+
+ for (i = 0; i < REF_CONTEXTS; i++)
+ for (j = 0; j < 2; j++)
+ cm->counts.comp_ref[i][j] += counts->comp_ref[i][j];
+
+ for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+ for (j = 0; j < TX_SIZES; j++)
+ cm->counts.tx.p32x32[i][j] += counts->tx.p32x32[i][j];
+
+ for (j = 0; j < TX_SIZES - 1; j++)
+ cm->counts.tx.p16x16[i][j] += counts->tx.p16x16[i][j];
+
+ for (j = 0; j < TX_SIZES - 2; j++)
+ cm->counts.tx.p8x8[i][j] += counts->tx.p8x8[i][j];
+ }
+
+ for (i = 0; i < TX_SIZES; i++)
+ cm->counts.tx.tx_totals[i] += counts->tx.tx_totals[i];
+
+ for (i = 0; i < SKIP_CONTEXTS; i++)
+ for (j = 0; j < 2; j++)
+ cm->counts.skip[i][j] += counts->skip[i][j];
+
+ for (i = 0; i < MV_JOINTS; i++)
+ cm->counts.mv.joints[i] += counts->mv.joints[i];
+
+ for (k = 0; k < 2; k++) {
+ nmv_component_counts *comps = &cm->counts.mv.comps[k];
+ nmv_component_counts *comps_t = &counts->mv.comps[k];
+
+ for (i = 0; i < 2; i++) {
+ comps->sign[i] += comps_t->sign[i];
+ comps->class0_hp[i] += comps_t->class0_hp[i];
+ comps->hp[i] += comps_t->hp[i];
+ }
+
+ for (i = 0; i < MV_CLASSES; i++)
+ comps->classes[i] += comps_t->classes[i];
+
+ for (i = 0; i < CLASS0_SIZE; i++) {
+ comps->class0[i] += comps_t->class0[i];
+ for (j = 0; j < MV_FP_SIZE; j++)
+ comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
+ }
+
+ for (i = 0; i < MV_OFFSET_BITS; i++)
+ for (j = 0; j < 2; j++)
+ comps->bits[i][j] += comps_t->bits[i][j];
+
+ for (i = 0; i < MV_FP_SIZE; i++)
+ comps->fp[i] += comps_t->fp[i];
+ }
+}
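
For reference, a minimal self-contained sketch of the merge pattern vp9_accumulate_frame_counts() applies above: each worker tallies into a private histogram and the totals are summed serially afterwards. The table sizes and names here are toy stand-ins for the real FRAME_COUNTS arrays, not libvpx API.

#include <stdio.h>

#define NUM_WORKERS 4
#define NUM_CONTEXTS 3
#define NUM_SYMBOLS 2

typedef struct {
  unsigned int mode[NUM_CONTEXTS][NUM_SYMBOLS];
} toy_counts;

// Merge one worker's partial histogram into the shared totals.
static void accumulate(toy_counts *total, const toy_counts *partial) {
  int i, j;
  for (i = 0; i < NUM_CONTEXTS; i++)
    for (j = 0; j < NUM_SYMBOLS; j++)
      total->mode[i][j] += partial->mode[i][j];
}

int main(void) {
  toy_counts total = { { { 0 } } };
  toy_counts per_worker[NUM_WORKERS];
  int w, i, j;
  // Fill each worker's histogram with dummy tallies.
  for (w = 0; w < NUM_WORKERS; w++)
    for (i = 0; i < NUM_CONTEXTS; i++)
      for (j = 0; j < NUM_SYMBOLS; j++)
        per_worker[w].mode[i][j] = (unsigned int)(w + i + j);
  for (w = 0; w < NUM_WORKERS; w++)
    accumulate(&total, &per_worker[w]);
  printf("total.mode[0][0] = %u\n", total.mode[0][0]);  // prints 6
  return 0;
}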
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.h
new file mode 100644
index 00000000000..3b3a6996ae9
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
+#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
+#include "./vpx_config.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_thread.h"
+
+struct VP9Common;
+struct FRAME_COUNTS;
+
+// Loopfilter row synchronization
+typedef struct VP9LfSyncData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_;
+ pthread_cond_t *cond_;
+#endif
+ // Storage for the loop-filtered superblock index in each row.
+ int *cur_sb_col;
+ // The optimal sync_range for different resolutions and platforms should be
+ // determined by testing. Currently, it is chosen to be a power-of-2 number.
+ int sync_range;
+ int rows;
+
+ // Row-based parallel loopfilter data
+ LFWorkerData *lfdata;
+ int num_workers;
+} VP9LfSync;
+
+// Allocate memory for loopfilter row synchronization.
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows,
+ int width, int num_workers);
+
+// Deallocate loopfilter synchronization-related mutexes and data.
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
+
+// Multi-threaded loopfilter that uses the tile threads.
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+ struct VP9Common *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int frame_filter_level,
+ int y_only, int partial_frame,
+ VP9Worker *workers, int num_workers,
+ VP9LfSync *lf_sync);
+
+void vp9_accumulate_frame_counts(struct VP9Common *cm,
+ struct FRAME_COUNTS *counts, int is_dec);
+
+#endif // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
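
The cur_sb_col/sync_range fields above support a wait/post protocol between adjacent superblock rows: a row may only be filtered once the row above has advanced sync_range superblocks past the current column. The sketch below shows one plausible shape of that protocol under that assumption; the names and layout are illustrative, not the library's internals.

#include <pthread.h>

typedef struct {
  pthread_mutex_t *mutex_;  // one mutex per row
  pthread_cond_t *cond_;    // one condition variable per row
  int *cur_sb_col;          // last superblock column completed, per row
  int sync_range;
  int rows;
} row_sync;

// Block until the row above has been filtered far enough ahead of c.
static void row_sync_wait(row_sync *s, int row, int c) {
  if (row > 0) {
    pthread_mutex_lock(&s->mutex_[row - 1]);
    while (s->cur_sb_col[row - 1] < c + s->sync_range)
      pthread_cond_wait(&s->cond_[row - 1], &s->mutex_[row - 1]);
    pthread_mutex_unlock(&s->mutex_[row - 1]);
  }
}

// Publish progress on this row and wake any waiter blocked on it.
static void row_sync_post(row_sync *s, int row, int c) {
  pthread_mutex_lock(&s->mutex_[row]);
  s->cur_sb_col[row] = c;
  pthread_cond_signal(&s->cond_[row]);
  pthread_mutex_unlock(&s->mutex_[row]);
}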
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c
index 8c4a30353c1..7a20e0a9e73 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c
@@ -36,24 +36,24 @@ void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
vp9_tile_set_col(tile, cm, col);
}
-void vp9_get_tile_n_bits(int mi_cols,
- int *min_log2_tile_cols, int *max_log2_tile_cols) {
- const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
- int min_log2 = 0, max_log2 = 0;
-
- // max
- while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
- ++max_log2;
- --max_log2;
- if (max_log2 < 0)
- max_log2 = 0;
-
- // min
- while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols)
+static int get_min_log2_tile_cols(const int sb64_cols) {
+ int min_log2 = 0;
+ while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols)
++min_log2;
+ return min_log2;
+}
- assert(min_log2 <= max_log2);
+static int get_max_log2_tile_cols(const int sb64_cols) {
+ int max_log2 = 1;
+ while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
+ ++max_log2;
+ return max_log2 - 1;
+}
- *min_log2_tile_cols = min_log2;
- *max_log2_tile_cols = max_log2;
+void vp9_get_tile_n_bits(int mi_cols,
+ int *min_log2_tile_cols, int *max_log2_tile_cols) {
+ const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
+ *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
+ *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+ assert(*min_log2_tile_cols <= *max_log2_tile_cols);
}
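
A small standalone check of the refactored bounds, assuming the usual VP9 constants MIN_TILE_WIDTH_B64 == 4 and MAX_TILE_WIDTH_B64 == 64 (vp9_tile_common.h holds the canonical values):

#include <assert.h>

#define MIN_TILE_WIDTH_B64 4
#define MAX_TILE_WIDTH_B64 64

static int min_log2_tile_cols(int sb64_cols) {
  int v = 0;
  while ((MAX_TILE_WIDTH_B64 << v) < sb64_cols) ++v;
  return v;
}

static int max_log2_tile_cols(int sb64_cols) {
  int v = 1;
  while ((sb64_cols >> v) >= MIN_TILE_WIDTH_B64) ++v;
  return v - 1;
}

int main(void) {
  // 1920 luma pixels -> 30 columns of 64x64 superblocks.
  assert(min_log2_tile_cols(30) == 0);  // one tile column is narrow enough
  assert(max_log2_tile_cols(30) == 2);  // at most 1 << 2 = 4 tile columns
  return 0;
}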
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c
index a0a599691c8..963023c53b1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c
@@ -118,7 +118,7 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
if (x_step_q4 == 16 && y_step_q4 == 16) { \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
+ DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]); \
vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 7); \
@@ -126,7 +126,7 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} else { \
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \
+ DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 65]); \
vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 1); \
@@ -259,7 +259,7 @@ void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
if (x_step_q4 == 16 && y_step_q4 == 16) { \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 71); \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
filter_x, x_step_q4, \
@@ -271,7 +271,7 @@ void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
filter_y, y_step_q4, \
w, h, bd); \
} else { \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 65); \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
filter_x, x_step_q4, \
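
The DECLARE_ALIGNED() form adopted above aligns the array object itself, where the removed DECLARE_ALIGNED_ARRAY() had to over-allocate a buffer and round a pointer up by hand. A rough sketch of the per-compiler definition follows; the real one lives in vpx_ports/mem.h, and the MY_ prefix marks this as illustrative.

#if defined(__GNUC__)
#define MY_DECLARE_ALIGNED(n, typ, val) typ __attribute__((aligned(n))) val
#elif defined(_MSC_VER)
#define MY_DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val
#else
#define MY_DECLARE_ALIGNED(n, typ, val) typ val
#endif

static void example(void) {
  // A 16-byte-aligned stack array, safe for movdqa-style aligned loads.
  MY_DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]);
  (void)fdata2;
}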
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
index 721126c7825..b12d29c0ad8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
@@ -345,7 +345,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
%if ARCH_X86_64
INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
movd m2, [aboveq-2]
mova m0, [aboveq]
mova m1, [aboveq+16]
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
index 7e63f389ead..1637f0e545a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
@@ -15,24 +15,38 @@
#include "vpx_ports/emmintrin_compat.h"
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
- __m128i ubounded;
- __m128i lbounded;
- __m128i retval;
-
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
- const __m128i max = _mm_subs_epi16(
- _mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
- const __m128i min = _mm_subs_epi16(zero, t80);
- ubounded = _mm_cmpgt_epi16(value, max);
- lbounded = _mm_cmplt_epi16(value, min);
- retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
- ubounded = _mm_and_si128(ubounded, max);
- lbounded = _mm_and_si128(lbounded, min);
- retval = _mm_or_si128(retval, ubounded);
- retval = _mm_or_si128(retval, lbounded);
- return retval;
+ __m128i ubounded;
+ __m128i lbounded;
+ __m128i retval;
+
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i t80, max, min;
+
+ if (bd == 8) {
+ t80 = _mm_set1_epi16(0x80);
+ max = _mm_subs_epi16(
+ _mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
+ } else if (bd == 10) {
+ t80 = _mm_set1_epi16(0x200);
+ max = _mm_subs_epi16(
+ _mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
+ } else { // bd == 12
+ t80 = _mm_set1_epi16(0x800);
+ max = _mm_subs_epi16(
+ _mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
+ }
+
+ min = _mm_subs_epi16(zero, t80);
+
+ ubounded = _mm_cmpgt_epi16(value, max);
+ lbounded = _mm_cmplt_epi16(value, min);
+ retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+ ubounded = _mm_and_si128(ubounded, max);
+ lbounded = _mm_and_si128(lbounded, min);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_or_si128(retval, lbounded);
+ return retval;
}
// TODO(debargha, peter): Break up large functions into smaller ones
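
A scalar reference for the clamp above: values are biased by t80 = 0x80 << (bd - 8), so the legal interval is [-t80, (1 << bd) - 1 - t80]. This is a sketch for clarity, not part of libvpx:

#include <stdint.h>

static int16_t signed_char_clamp_bd(int32_t value, int bd) {
  const int32_t t80 = 0x80 << (bd - 8);
  const int32_t max = (1 << bd) - 1 - t80;  // matches ((1 << bd) - 1) - t80
  const int32_t min = -t80;                 // matches 0 - t80
  if (value > max) return (int16_t)max;
  if (value < min) return (int16_t)min;
  return (int16_t)value;
}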
@@ -45,14 +59,7 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
int bd) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
- const __m128i blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(
- _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
- const __m128i limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), bd - 8);
- const __m128i thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(
- _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+ __m128i blimit, limit, thresh;
__m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
__m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
__m128i ps1, qs1, ps0, qs0;
@@ -68,6 +75,26 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
__m128i t4, t3, t80, t1;
__m128i eight, four;
+ if (bd == 8) {
+ blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+ limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+ thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ } else if (bd == 10) {
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ } else { // bd == 12
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ }
+
q4 = _mm_load_si128((__m128i *)(s + 4 * p));
p4 = _mm_load_si128((__m128i *)(s - 5 * p));
q3 = _mm_load_si128((__m128i *)(s + 3 * p));
@@ -121,7 +148,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
// highbd_filter4
t4 = _mm_set1_epi16(4);
t3 = _mm_set1_epi16(3);
- t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+ if (bd == 8)
+ t80 = _mm_set1_epi16(0x80);
+ else if (bd == 10)
+ t80 = _mm_set1_epi16(0x200);
+ else // bd == 12
+ t80 = _mm_set1_epi16(0x800);
+
t1 = _mm_set1_epi16(0x1);
ps1 = _mm_subs_epi16(p1, t80);
@@ -136,7 +169,6 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
filt = _mm_adds_epi16(filt, work_a);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
filt = _mm_and_si128(filt, mask);
-
filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
@@ -153,13 +185,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
filt = _mm_andnot_si128(hev, filt);
-
qs1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
t80);
ps1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
t80);
+
// end highbd_filter4
// loopfilter done
@@ -175,7 +207,14 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
flat = _mm_max_epi16(work, flat);
work = _mm_max_epi16(abs_p1p0, abs_q1q0);
flat = _mm_max_epi16(work, flat);
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ if (bd == 8)
+ flat = _mm_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
flat = _mm_cmpeq_epi16(flat, zero);
// end flat_mask4
@@ -215,7 +254,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
_mm_subs_epu16(q0, q7)));
flat2 = _mm_max_epi16(work, flat2);
- flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, bd - 8));
+ if (bd == 8)
+ flat2 = _mm_subs_epu16(flat2, one);
+ else if (bd == 10)
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
+
flat2 = _mm_cmpeq_epi16(flat2, zero);
flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
// end highbd_flat_mask5
@@ -479,22 +524,14 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
const uint8_t *_limit,
const uint8_t *_thresh,
int count, int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op2, 16);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op1, 16);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op0, 16);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq2, 16);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq1, 16);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq0, 16);
+ DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
const __m128i zero = _mm_set1_epi16(0);
- const __m128i blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero),
- bd - 8);
- const __m128i limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero),
- bd - 8);
- const __m128i thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero),
- bd - 8);
+ __m128i blimit, limit, thresh;
__m128i mask, hev, flat;
__m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
__m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
@@ -512,18 +549,43 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
- const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+ __m128i t80;
const __m128i t1 = _mm_set1_epi16(0x1);
- const __m128i ps1 = _mm_subs_epi16(p1, t80);
- const __m128i ps0 = _mm_subs_epi16(p0, t80);
- const __m128i qs0 = _mm_subs_epi16(q0, t80);
- const __m128i qs1 = _mm_subs_epi16(q1, t80);
+ __m128i ps1, ps0, qs0, qs1;
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
(void)count;
+ if (bd == 8) {
+ blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+ limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+ thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ } else if (bd == 10) {
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ t80 = _mm_set1_epi16(0x200);
+ } else { // bd == 12
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ t80 = _mm_set1_epi16(0x800);
+ }
+
+ ps1 = _mm_subs_epi16(p1, t80);
+ ps0 = _mm_subs_epi16(p0, t80);
+ qs0 = _mm_subs_epi16(q0, t80);
+ qs1 = _mm_subs_epi16(q1, t80);
+
// filter_mask and hev_mask
abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
_mm_subs_epu16(p0, p1));
@@ -575,7 +637,14 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
flat = _mm_max_epi16(work, flat);
flat = _mm_max_epi16(abs_p1p0, flat);
flat = _mm_max_epi16(abs_q1q0, flat);
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ if (bd == 8)
+ flat = _mm_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
@@ -706,15 +775,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
const uint8_t *_thresh,
int count, int bd) {
const __m128i zero = _mm_set1_epi16(0);
- const __m128i blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(
- _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
- const __m128i limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(
- _mm_load_si128((const __m128i *)_limit), zero), bd - 8);
- const __m128i thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(
- _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+ __m128i blimit, limit, thresh;
__m128i mask, hev, flat;
__m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
__m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
@@ -737,30 +798,63 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
__m128i work;
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
- const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
- const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
- const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
- const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
+ __m128i t80;
+ __m128i tff80;
+ __m128i tffe0;
+ __m128i t1f;
// equivalent to shifting 0x1f left by bitdepth - 8
// and setting new bits to 1
const __m128i t1 = _mm_set1_epi16(0x1);
- const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
+ __m128i t7f;
// equivalent to shifting 0x7f left by bitdepth - 8
// and setting new bits to 1
- const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
- t80);
- const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
- t80);
- const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
- t80);
- const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
- t80);
+ __m128i ps1, ps0, qs0, qs1;
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
(void)count;
+ if (bd == 8) {
+ blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+ limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+ thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ tff80 = _mm_set1_epi16(0xff80);
+ tffe0 = _mm_set1_epi16(0xffe0);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+ } else if (bd == 10) {
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+ } else { // bd == 12
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+ }
+
+ ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+ ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+ qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+ qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+
// filter_mask and hev_mask
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu16(flat, thresh);
@@ -796,6 +890,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+
// (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
@@ -964,7 +1059,7 @@ void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
const uint8_t *limit,
const uint8_t *thresh,
int count, int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 8);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
uint16_t *src[1];
uint16_t *dst[1];
(void)count;
@@ -994,7 +1089,7 @@ void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
const uint8_t *limit1,
const uint8_t *thresh1,
int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 8);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
uint16_t *src[2];
uint16_t *dst[2];
@@ -1018,7 +1113,7 @@ void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
const uint8_t *limit,
const uint8_t *thresh,
int count, int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 8);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
uint16_t *src[1];
uint16_t *dst[1];
(void)count;
@@ -1048,7 +1143,7 @@ void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
const uint8_t *limit1,
const uint8_t *thresh1,
int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 8);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
uint16_t *src[2];
uint16_t *dst[2];
@@ -1073,7 +1168,7 @@ void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
const uint8_t *limit,
const uint8_t *thresh,
int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 16);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
uint16_t *src[2];
uint16_t *dst[2];
@@ -1103,7 +1198,7 @@ void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
const uint8_t *limit,
const uint8_t *thresh,
int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 256);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
// Transpose 16x16
highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
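
Throughout this file the generic "x << (bd - 8)" scaling is replaced by explicit bd == 8/10/12 arms, presumably so the shift counts become compile-time immediates. A scalar sketch of the mapping the branches hard-code (the helper name is illustrative):

#include <assert.h>
#include <stdint.h>

static uint16_t t80_for_bd(int bd) {
  return (uint16_t)(0x80 << (bd - 8));
}

int main(void) {
  assert(t80_for_bd(8) == 0x80);    // bd == 8: no shift
  assert(t80_for_bd(10) == 0x200);  // bd == 10: << 2
  assert(t80_for_bd(12) == 0x800);  // bd == 12: << 4
  return 0;
}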
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index df609872b75..0385c7955c9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -9,6 +9,7 @@
*/
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+#include "vp9/common/vp9_idct.h"
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
@@ -16,17 +17,16 @@
d0 = _mm_unpacklo_epi8(d0, zero); \
d0 = _mm_add_epi16(in_x, d0); \
d0 = _mm_packus_epi16(d0, d0); \
- *(int *)dest = _mm_cvtsi128_si32(d0); \
- dest += stride; \
+ *(int *)(dest) = _mm_cvtsi128_si32(d0); \
}
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
- (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
- (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
- (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+ const __m128i cst = _mm_setr_epi16(
+ (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
+ (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+ (int16_t)cospi_8_64, (int16_t)cospi_24_64);
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i input0, input1, input2, input3;
@@ -125,28 +125,28 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// Reconstruction and Store
{
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *) (dest + stride)));
- d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
- *(const int *) (dest + stride * 3)), d2);
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, input2);
- d2 = _mm_add_epi16(d2, input3);
- d0 = _mm_packus_epi16(d0, d2);
- // store input0
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store input1
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store input2
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
- // store input3
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi32(d0,
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d2 = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, input2);
+ d2 = _mm_add_epi16(d2, input3);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store input0
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store input2
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ // store input3
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
}
}
@@ -161,10 +161,10 @@ void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dc_value = _mm_set1_epi16(a);
- RECON_AND_STORE4X4(dest, dc_value);
- RECON_AND_STORE4X4(dest, dc_value);
- RECON_AND_STORE4X4(dest, dc_value);
- RECON_AND_STORE4X4(dest, dc_value);
+ RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
}
static INLINE void transpose_4x4(__m128i *res) {
@@ -216,7 +216,7 @@ static void iadst4_sse2(__m128i *in) {
const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
const __m128i kZero = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8], in7;
@@ -266,8 +266,8 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- in[0]= _mm_loadu_si128((const __m128i *)(input));
- in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
+ in[0] = _mm_loadu_si128((const __m128i *)(input));
+ in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
switch (tx_type) {
case 0: // DCT_DCT
@@ -300,28 +300,28 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
// Reconstruction and Store
{
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *) (dest + stride)));
- d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
- *(const int *) (dest + stride * 3)));
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, in[0]);
- d2 = _mm_add_epi16(d2, in[1]);
- d0 = _mm_packus_epi16(d0, d2);
- // store result[0]
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store result[1]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store result[2]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
- // store result[3]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi32(d0,
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d2 = _mm_unpacklo_epi32(
+ d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, in[0]);
+ d2 = _mm_add_epi16(d2, in[1]);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store result[0]
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store result[1]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store result[2]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ // store result[3]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
}
}
@@ -516,7 +516,7 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<4);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
@@ -550,7 +550,7 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// 4-stage 1D idct8x8
IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ in0, in1, in2, in3, in4, in5, in6, in7);
}
// Final rounding and shift
@@ -572,14 +572,14 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
}
void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
@@ -593,14 +593,14 @@ void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dc_value = _mm_set1_epi16(a);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest + 0 * stride, dc_value);
+ RECON_AND_STORE(dest + 1 * stride, dc_value);
+ RECON_AND_STORE(dest + 2 * stride, dc_value);
+ RECON_AND_STORE(dest + 3 * stride, dc_value);
+ RECON_AND_STORE(dest + 4 * stride, dc_value);
+ RECON_AND_STORE(dest + 5 * stride, dc_value);
+ RECON_AND_STORE(dest + 6 * stride, dc_value);
+ RECON_AND_STORE(dest + 7 * stride, dc_value);
}
static void idct8_sse2(__m128i *in) {
@@ -625,7 +625,7 @@ static void idct8_sse2(__m128i *in) {
// 4-stage 1D idct8x8
IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
- in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
+ in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}
static void iadst8_sse2(__m128i *in) {
@@ -641,7 +641,7 @@ static void iadst8_sse2(__m128i *in) {
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__const_0 = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -655,14 +655,14 @@ static void iadst8_sse2(__m128i *in) {
array_transpose_8x8(in, in);
// properly aligned for butterfly input
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
// column transformation
// stage 1
@@ -856,12 +856,11 @@ static void iadst8_sse2(__m128i *in) {
in[7] = _mm_sub_epi16(k__const_0, s1);
}
-
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in[8];
const __m128i zero = _mm_setzero_si128();
- const __m128i final_rounding = _mm_set1_epi16(1<<4);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data
in[0] = _mm_load_si128((const __m128i *)input);
@@ -914,20 +913,20 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
in[6] = _mm_srai_epi16(in[6], 5);
in[7] = _mm_srai_epi16(in[7], 5);
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
}
void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<4);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
@@ -952,7 +951,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// 8x4 Transpose
TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
// Stage1
- { //NOLINT
+ {
const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
@@ -975,7 +974,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
}
// Stage2
- { //NOLINT
+ {
const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
@@ -1005,7 +1004,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
}
// Stage3
- { //NOLINT
+ {
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
@@ -1034,7 +1033,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ in0, in1, in2, in3, in4, in5, in6, in7);
// Final rounding and shift
in0 = _mm_adds_epi16(in0, final_rounding);
in1 = _mm_adds_epi16(in1, final_rounding);
@@ -1054,14 +1053,14 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
}
#define IDCT16 \
@@ -1304,7 +1303,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
const __m128i zero = _mm_setzero_si128();
const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1343,130 +1342,86 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
curr1 = l;
for (i = 0; i < 2; i++) {
- // 1-D idct
-
- // Load input data.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
- in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
- in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
- in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
- in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
- in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
- in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
- in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
- in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
- array_transpose_8x8(in, in);
- array_transpose_8x8(in+8, in+8);
-
- IDCT16
-
- // Stage7
- curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
- curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
- curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
- curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
- curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
- curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
- curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
- curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
- curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
- curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
- curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
- curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
- curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
- curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
- curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
- curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- curr1 = r;
- input += 128;
+ // 1-D idct
+
+ // Load input data.
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+ in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+ in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+ in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+ in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+
+ IDCT16
+
+ // Stage7
+ curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+ curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+ curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+ curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+ curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+ curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+ curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+ curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+ curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ curr1 = r;
+ input += 128;
}
for (i = 0; i < 2; i++) {
- // 1-D idct
- array_transpose_8x8(l+i*8, in);
- array_transpose_8x8(r+i*8, in+8);
-
- IDCT16
-
- // 2-D
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+ int j;
+ // 1-D idct
+ array_transpose_8x8(l + i * 8, in);
+ array_transpose_8x8(r + i * 8, in + 8);
+
+ IDCT16
+ // 2-D
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ for (j = 0; j < 16; ++j) {
// Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
-
- dest += 8 - (stride * 16);
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
}
}
@@ -1482,23 +1437,23 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dc_value = _mm_set1_epi16(a);
for (i = 0; i < 2; ++i) {
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- dest += 8 - (stride * 16);
+ RECON_AND_STORE(dest + 0 * stride, dc_value);
+ RECON_AND_STORE(dest + 1 * stride, dc_value);
+ RECON_AND_STORE(dest + 2 * stride, dc_value);
+ RECON_AND_STORE(dest + 3 * stride, dc_value);
+ RECON_AND_STORE(dest + 4 * stride, dc_value);
+ RECON_AND_STORE(dest + 5 * stride, dc_value);
+ RECON_AND_STORE(dest + 6 * stride, dc_value);
+ RECON_AND_STORE(dest + 7 * stride, dc_value);
+ RECON_AND_STORE(dest + 8 * stride, dc_value);
+ RECON_AND_STORE(dest + 9 * stride, dc_value);
+ RECON_AND_STORE(dest + 10 * stride, dc_value);
+ RECON_AND_STORE(dest + 11 * stride, dc_value);
+ RECON_AND_STORE(dest + 12 * stride, dc_value);
+ RECON_AND_STORE(dest + 13 * stride, dc_value);
+ RECON_AND_STORE(dest + 14 * stride, dc_value);
+ RECON_AND_STORE(dest + 15 * stride, dc_value);
+ dest += 8;
}
}
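
With rows now addressed explicitly as dest + j * stride, the destination pointer only ever steps across the two 8-pixel column halves, so the plain "dest += 8" replaces the old "dest += 8 - (stride * 16)" rewind. A toy sketch of the addressing:

#include <stdint.h>

static void fill_block16(uint8_t *dest, int stride, uint8_t v) {
  int half, j, k;
  for (half = 0; half < 2; ++half) {
    for (j = 0; j < 16; ++j) {
      uint8_t *row = dest + j * stride;  // row j of this 8-wide half
      for (k = 0; k < 8; ++k) row[k] = v;
    }
    dest += 8;  // advance right to the next 8 columns; no rewind needed
  }
}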
@@ -1530,8 +1485,8 @@ static void iadst16_8col(__m128i *in) {
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -1985,7 +1940,7 @@ static void idct16_8col(__m128i *in) {
const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -2366,7 +2321,7 @@ void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
const __m128i zero = _mm_setzero_si128();
const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -2405,7 +2360,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
// Stage2
{
const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
- const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
@@ -2566,7 +2521,8 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
// Second 1-D inverse transform, performed per 8x16 block
for (i = 0; i < 2; i++) {
- array_transpose_4X8(l + 8*i, in);
+ int j;
+ array_transpose_4X8(l + 8 * i, in);
IDCT16_10
@@ -2588,59 +2544,14 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
in[14] = _mm_sub_epi16(stp2_1, stp1_14);
in[15] = _mm_sub_epi16(stp2_0, stp1_15);
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
-
- dest += 8 - (stride * 16);
+ for (j = 0; j < 16; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
}
}
@@ -3285,7 +3196,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
// Only upper-left 8x8 has non-zero coeff
void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -3386,9 +3297,9 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
LOAD_DQCOEFF(in[31], input);
array_transpose_8x8(in, in);
- array_transpose_8x8(in+8, in+8);
- array_transpose_8x8(in+16, in+16);
- array_transpose_8x8(in+24, in+24);
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
IDCT32
@@ -3426,153 +3337,61 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
col[30] = _mm_sub_epi16(stp1_1, stp1_30);
col[31] = _mm_sub_epi16(stp1_0, stp1_31);
for (i = 0; i < 4; i++) {
- const __m128i zero = _mm_setzero_si128();
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col+i*8, in);
- IDCT32_34
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
+ int j;
+ const __m128i zero = _mm_setzero_si128();
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + i * 8, in);
+ IDCT32_34
+
+ // 2-D: Calculate the results and store them to the destination.
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+ for (j = 0; j < 32; ++j) {
// Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
- in[16] = _mm_adds_epi16(in[16], final_rounding);
- in[17] = _mm_adds_epi16(in[17], final_rounding);
- in[18] = _mm_adds_epi16(in[18], final_rounding);
- in[19] = _mm_adds_epi16(in[19], final_rounding);
- in[20] = _mm_adds_epi16(in[20], final_rounding);
- in[21] = _mm_adds_epi16(in[21], final_rounding);
- in[22] = _mm_adds_epi16(in[22], final_rounding);
- in[23] = _mm_adds_epi16(in[23], final_rounding);
- in[24] = _mm_adds_epi16(in[24], final_rounding);
- in[25] = _mm_adds_epi16(in[25], final_rounding);
- in[26] = _mm_adds_epi16(in[26], final_rounding);
- in[27] = _mm_adds_epi16(in[27], final_rounding);
- in[28] = _mm_adds_epi16(in[28], final_rounding);
- in[29] = _mm_adds_epi16(in[29], final_rounding);
- in[30] = _mm_adds_epi16(in[30], final_rounding);
- in[31] = _mm_adds_epi16(in[31], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
- in[16] = _mm_srai_epi16(in[16], 6);
- in[17] = _mm_srai_epi16(in[17], 6);
- in[18] = _mm_srai_epi16(in[18], 6);
- in[19] = _mm_srai_epi16(in[19], 6);
- in[20] = _mm_srai_epi16(in[20], 6);
- in[21] = _mm_srai_epi16(in[21], 6);
- in[22] = _mm_srai_epi16(in[22], 6);
- in[23] = _mm_srai_epi16(in[23], 6);
- in[24] = _mm_srai_epi16(in[24], 6);
- in[25] = _mm_srai_epi16(in[25], 6);
- in[26] = _mm_srai_epi16(in[26], 6);
- in[27] = _mm_srai_epi16(in[27], 6);
- in[28] = _mm_srai_epi16(in[28], 6);
- in[29] = _mm_srai_epi16(in[29], 6);
- in[30] = _mm_srai_epi16(in[30], 6);
- in[31] = _mm_srai_epi16(in[31], 6);
-
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
- RECON_AND_STORE(dest, in[16]);
- RECON_AND_STORE(dest, in[17]);
- RECON_AND_STORE(dest, in[18]);
- RECON_AND_STORE(dest, in[19]);
- RECON_AND_STORE(dest, in[20]);
- RECON_AND_STORE(dest, in[21]);
- RECON_AND_STORE(dest, in[22]);
- RECON_AND_STORE(dest, in[23]);
- RECON_AND_STORE(dest, in[24]);
- RECON_AND_STORE(dest, in[25]);
- RECON_AND_STORE(dest, in[26]);
- RECON_AND_STORE(dest, in[27]);
- RECON_AND_STORE(dest, in[28]);
- RECON_AND_STORE(dest, in[29]);
- RECON_AND_STORE(dest, in[30]);
- RECON_AND_STORE(dest, in[31]);
-
- dest += 8 - (stride * 32);
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
}
+
+ dest += 8;
}
+}
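
The collapsed loop above is enabled by a macro change later in this diff: RECON_AND_STORE no longer advances dest as a side effect (see the vp9_idct_intrin_sse2.h hunk below), so the row offset is passed explicitly and the old column-step arithmetic dest += 8 - (stride * 32) reduces to dest += 8. A minimal scalar sketch of the addressing equivalence, with a hypothetical store8() standing in for the 8-pixel reconstruct-and-store:

    #include <stdint.h>

    static void store8(uint8_t *dst) { (void)dst; /* stands in for RECON_AND_STORE */ }

    /* Old scheme: the macro advanced dest one row per call, so each
     * 8-wide column pass ended with a rewind of 32 rows plus a step of
     * 8 pixels. */
    static void old_addressing(uint8_t *dest, int stride) {
      int i, j;
      for (i = 0; i < 4; i++) {
        for (j = 0; j < 32; j++) {
          store8(dest);
          dest += stride;
        }
        dest += 8 - (stride * 32);
      }
    }

    /* New scheme: the row offset is explicit and dest only steps across
     * columns; both loops visit exactly the same addresses. */
    static void new_addressing(uint8_t *dest, int stride) {
      int i, j;
      for (i = 0; i < 4; i++) {
        for (j = 0; j < 32; j++)
          store8(dest + j * stride);
        dest += 8;
      }
    }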
void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
const __m128i zero = _mm_setzero_si128();
// idct constants for each stage
@@ -3639,304 +3458,211 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
for (i = 0; i < 4; i++) {
i32 = (i << 5);
- // First 1-D idct
- // Load input data.
- LOAD_DQCOEFF(in[0], input);
- LOAD_DQCOEFF(in[8], input);
- LOAD_DQCOEFF(in[16], input);
- LOAD_DQCOEFF(in[24], input);
- LOAD_DQCOEFF(in[1], input);
- LOAD_DQCOEFF(in[9], input);
- LOAD_DQCOEFF(in[17], input);
- LOAD_DQCOEFF(in[25], input);
- LOAD_DQCOEFF(in[2], input);
- LOAD_DQCOEFF(in[10], input);
- LOAD_DQCOEFF(in[18], input);
- LOAD_DQCOEFF(in[26], input);
- LOAD_DQCOEFF(in[3], input);
- LOAD_DQCOEFF(in[11], input);
- LOAD_DQCOEFF(in[19], input);
- LOAD_DQCOEFF(in[27], input);
-
- LOAD_DQCOEFF(in[4], input);
- LOAD_DQCOEFF(in[12], input);
- LOAD_DQCOEFF(in[20], input);
- LOAD_DQCOEFF(in[28], input);
- LOAD_DQCOEFF(in[5], input);
- LOAD_DQCOEFF(in[13], input);
- LOAD_DQCOEFF(in[21], input);
- LOAD_DQCOEFF(in[29], input);
- LOAD_DQCOEFF(in[6], input);
- LOAD_DQCOEFF(in[14], input);
- LOAD_DQCOEFF(in[22], input);
- LOAD_DQCOEFF(in[30], input);
- LOAD_DQCOEFF(in[7], input);
- LOAD_DQCOEFF(in[15], input);
- LOAD_DQCOEFF(in[23], input);
- LOAD_DQCOEFF(in[31], input);
-
- // checking if all entries are zero
- zero_idx[0] = _mm_or_si128(in[0], in[1]);
- zero_idx[1] = _mm_or_si128(in[2], in[3]);
- zero_idx[2] = _mm_or_si128(in[4], in[5]);
- zero_idx[3] = _mm_or_si128(in[6], in[7]);
- zero_idx[4] = _mm_or_si128(in[8], in[9]);
- zero_idx[5] = _mm_or_si128(in[10], in[11]);
- zero_idx[6] = _mm_or_si128(in[12], in[13]);
- zero_idx[7] = _mm_or_si128(in[14], in[15]);
- zero_idx[8] = _mm_or_si128(in[16], in[17]);
- zero_idx[9] = _mm_or_si128(in[18], in[19]);
- zero_idx[10] = _mm_or_si128(in[20], in[21]);
- zero_idx[11] = _mm_or_si128(in[22], in[23]);
- zero_idx[12] = _mm_or_si128(in[24], in[25]);
- zero_idx[13] = _mm_or_si128(in[26], in[27]);
- zero_idx[14] = _mm_or_si128(in[28], in[29]);
- zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
- zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
- zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
- zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
- if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
- col[i32 + 0] = _mm_setzero_si128();
- col[i32 + 1] = _mm_setzero_si128();
- col[i32 + 2] = _mm_setzero_si128();
- col[i32 + 3] = _mm_setzero_si128();
- col[i32 + 4] = _mm_setzero_si128();
- col[i32 + 5] = _mm_setzero_si128();
- col[i32 + 6] = _mm_setzero_si128();
- col[i32 + 7] = _mm_setzero_si128();
- col[i32 + 8] = _mm_setzero_si128();
- col[i32 + 9] = _mm_setzero_si128();
- col[i32 + 10] = _mm_setzero_si128();
- col[i32 + 11] = _mm_setzero_si128();
- col[i32 + 12] = _mm_setzero_si128();
- col[i32 + 13] = _mm_setzero_si128();
- col[i32 + 14] = _mm_setzero_si128();
- col[i32 + 15] = _mm_setzero_si128();
- col[i32 + 16] = _mm_setzero_si128();
- col[i32 + 17] = _mm_setzero_si128();
- col[i32 + 18] = _mm_setzero_si128();
- col[i32 + 19] = _mm_setzero_si128();
- col[i32 + 20] = _mm_setzero_si128();
- col[i32 + 21] = _mm_setzero_si128();
- col[i32 + 22] = _mm_setzero_si128();
- col[i32 + 23] = _mm_setzero_si128();
- col[i32 + 24] = _mm_setzero_si128();
- col[i32 + 25] = _mm_setzero_si128();
- col[i32 + 26] = _mm_setzero_si128();
- col[i32 + 27] = _mm_setzero_si128();
- col[i32 + 28] = _mm_setzero_si128();
- col[i32 + 29] = _mm_setzero_si128();
- col[i32 + 30] = _mm_setzero_si128();
- col[i32 + 31] = _mm_setzero_si128();
- continue;
- }
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(in, in);
- array_transpose_8x8(in+8, in+8);
- array_transpose_8x8(in+16, in+16);
- array_transpose_8x8(in+24, in+24);
-
- IDCT32
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ // First 1-D idct
+ // Load input data.
+ LOAD_DQCOEFF(in[0], input);
+ LOAD_DQCOEFF(in[8], input);
+ LOAD_DQCOEFF(in[16], input);
+ LOAD_DQCOEFF(in[24], input);
+ LOAD_DQCOEFF(in[1], input);
+ LOAD_DQCOEFF(in[9], input);
+ LOAD_DQCOEFF(in[17], input);
+ LOAD_DQCOEFF(in[25], input);
+ LOAD_DQCOEFF(in[2], input);
+ LOAD_DQCOEFF(in[10], input);
+ LOAD_DQCOEFF(in[18], input);
+ LOAD_DQCOEFF(in[26], input);
+ LOAD_DQCOEFF(in[3], input);
+ LOAD_DQCOEFF(in[11], input);
+ LOAD_DQCOEFF(in[19], input);
+ LOAD_DQCOEFF(in[27], input);
+
+ LOAD_DQCOEFF(in[4], input);
+ LOAD_DQCOEFF(in[12], input);
+ LOAD_DQCOEFF(in[20], input);
+ LOAD_DQCOEFF(in[28], input);
+ LOAD_DQCOEFF(in[5], input);
+ LOAD_DQCOEFF(in[13], input);
+ LOAD_DQCOEFF(in[21], input);
+ LOAD_DQCOEFF(in[29], input);
+ LOAD_DQCOEFF(in[6], input);
+ LOAD_DQCOEFF(in[14], input);
+ LOAD_DQCOEFF(in[22], input);
+ LOAD_DQCOEFF(in[30], input);
+ LOAD_DQCOEFF(in[7], input);
+ LOAD_DQCOEFF(in[15], input);
+ LOAD_DQCOEFF(in[23], input);
+ LOAD_DQCOEFF(in[31], input);
+
+    // Check whether all entries are zero
+ zero_idx[0] = _mm_or_si128(in[0], in[1]);
+ zero_idx[1] = _mm_or_si128(in[2], in[3]);
+ zero_idx[2] = _mm_or_si128(in[4], in[5]);
+ zero_idx[3] = _mm_or_si128(in[6], in[7]);
+ zero_idx[4] = _mm_or_si128(in[8], in[9]);
+ zero_idx[5] = _mm_or_si128(in[10], in[11]);
+ zero_idx[6] = _mm_or_si128(in[12], in[13]);
+ zero_idx[7] = _mm_or_si128(in[14], in[15]);
+ zero_idx[8] = _mm_or_si128(in[16], in[17]);
+ zero_idx[9] = _mm_or_si128(in[18], in[19]);
+ zero_idx[10] = _mm_or_si128(in[20], in[21]);
+ zero_idx[11] = _mm_or_si128(in[22], in[23]);
+ zero_idx[12] = _mm_or_si128(in[24], in[25]);
+ zero_idx[13] = _mm_or_si128(in[26], in[27]);
+ zero_idx[14] = _mm_or_si128(in[28], in[29]);
+ zero_idx[15] = _mm_or_si128(in[30], in[31]);
+
+ zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+ zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+ zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+ if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
+ col[i32 + 0] = _mm_setzero_si128();
+ col[i32 + 1] = _mm_setzero_si128();
+ col[i32 + 2] = _mm_setzero_si128();
+ col[i32 + 3] = _mm_setzero_si128();
+ col[i32 + 4] = _mm_setzero_si128();
+ col[i32 + 5] = _mm_setzero_si128();
+ col[i32 + 6] = _mm_setzero_si128();
+ col[i32 + 7] = _mm_setzero_si128();
+ col[i32 + 8] = _mm_setzero_si128();
+ col[i32 + 9] = _mm_setzero_si128();
+ col[i32 + 10] = _mm_setzero_si128();
+ col[i32 + 11] = _mm_setzero_si128();
+ col[i32 + 12] = _mm_setzero_si128();
+ col[i32 + 13] = _mm_setzero_si128();
+ col[i32 + 14] = _mm_setzero_si128();
+ col[i32 + 15] = _mm_setzero_si128();
+ col[i32 + 16] = _mm_setzero_si128();
+ col[i32 + 17] = _mm_setzero_si128();
+ col[i32 + 18] = _mm_setzero_si128();
+ col[i32 + 19] = _mm_setzero_si128();
+ col[i32 + 20] = _mm_setzero_si128();
+ col[i32 + 21] = _mm_setzero_si128();
+ col[i32 + 22] = _mm_setzero_si128();
+ col[i32 + 23] = _mm_setzero_si128();
+ col[i32 + 24] = _mm_setzero_si128();
+ col[i32 + 25] = _mm_setzero_si128();
+ col[i32 + 26] = _mm_setzero_si128();
+ col[i32 + 27] = _mm_setzero_si128();
+ col[i32 + 28] = _mm_setzero_si128();
+ col[i32 + 29] = _mm_setzero_si128();
+ col[i32 + 30] = _mm_setzero_si128();
+ col[i32 + 31] = _mm_setzero_si128();
+ continue;
}
- for (i = 0; i < 4; i++) {
- // Second 1-D idct
- j = i << 3;
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col+j, in);
- array_transpose_8x8(col+j+32, in+8);
- array_transpose_8x8(col+j+64, in+16);
- array_transpose_8x8(col+j+96, in+24);
-
- IDCT32
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
+
+ IDCT32
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ }
+ for (i = 0; i < 4; i++) {
+ // Second 1-D idct
+ j = i << 3;
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + j, in);
+ array_transpose_8x8(col + j + 32, in + 8);
+ array_transpose_8x8(col + j + 64, in + 16);
+ array_transpose_8x8(col + j + 96, in + 24);
+
+ IDCT32
+
+    // 2_D: Calculate the results and store them to the destination.
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+ for (j = 0; j < 32; ++j) {
// Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
- in[16] = _mm_adds_epi16(in[16], final_rounding);
- in[17] = _mm_adds_epi16(in[17], final_rounding);
- in[18] = _mm_adds_epi16(in[18], final_rounding);
- in[19] = _mm_adds_epi16(in[19], final_rounding);
- in[20] = _mm_adds_epi16(in[20], final_rounding);
- in[21] = _mm_adds_epi16(in[21], final_rounding);
- in[22] = _mm_adds_epi16(in[22], final_rounding);
- in[23] = _mm_adds_epi16(in[23], final_rounding);
- in[24] = _mm_adds_epi16(in[24], final_rounding);
- in[25] = _mm_adds_epi16(in[25], final_rounding);
- in[26] = _mm_adds_epi16(in[26], final_rounding);
- in[27] = _mm_adds_epi16(in[27], final_rounding);
- in[28] = _mm_adds_epi16(in[28], final_rounding);
- in[29] = _mm_adds_epi16(in[29], final_rounding);
- in[30] = _mm_adds_epi16(in[30], final_rounding);
- in[31] = _mm_adds_epi16(in[31], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
- in[16] = _mm_srai_epi16(in[16], 6);
- in[17] = _mm_srai_epi16(in[17], 6);
- in[18] = _mm_srai_epi16(in[18], 6);
- in[19] = _mm_srai_epi16(in[19], 6);
- in[20] = _mm_srai_epi16(in[20], 6);
- in[21] = _mm_srai_epi16(in[21], 6);
- in[22] = _mm_srai_epi16(in[22], 6);
- in[23] = _mm_srai_epi16(in[23], 6);
- in[24] = _mm_srai_epi16(in[24], 6);
- in[25] = _mm_srai_epi16(in[25], 6);
- in[26] = _mm_srai_epi16(in[26], 6);
- in[27] = _mm_srai_epi16(in[27], 6);
- in[28] = _mm_srai_epi16(in[28], 6);
- in[29] = _mm_srai_epi16(in[29], 6);
- in[30] = _mm_srai_epi16(in[30], 6);
- in[31] = _mm_srai_epi16(in[31], 6);
-
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
- RECON_AND_STORE(dest, in[16]);
- RECON_AND_STORE(dest, in[17]);
- RECON_AND_STORE(dest, in[18]);
- RECON_AND_STORE(dest, in[19]);
- RECON_AND_STORE(dest, in[20]);
- RECON_AND_STORE(dest, in[21]);
- RECON_AND_STORE(dest, in[22]);
- RECON_AND_STORE(dest, in[23]);
- RECON_AND_STORE(dest, in[24]);
- RECON_AND_STORE(dest, in[25]);
- RECON_AND_STORE(dest, in[26]);
- RECON_AND_STORE(dest, in[27]);
- RECON_AND_STORE(dest, in[28]);
- RECON_AND_STORE(dest, in[29]);
- RECON_AND_STORE(dest, in[30]);
- RECON_AND_STORE(dest, in[31]);
-
- dest += 8 - (stride * 32);
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
}
-} //NOLINT
+
+ dest += 8;
+ }
+}
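
The early-out in the first pass folds all 32 input vectors into a single vector with a binary OR tree (zero_idx), then tests that vector against zero with one compare and movemask. Reduced to scalars, the predicate is just:

    #include <stdint.h>

    /* Sketch: scalar form of the zero_idx reduction. The SSE2 version
     * ORs 8 lanes at a time in a log2-depth tree and accepts the slice
     * as all-zero when _mm_movemask_epi8 of the equality mask is 0xFFFF. */
    static int slice_is_zero(const int16_t *coeffs, int n) {
      int16_t acc = 0;
      int i;
      for (i = 0; i < n; i++)
        acc |= coeffs[i];
      return acc == 0;
    }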
void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
@@ -3950,38 +3676,580 @@ void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dc_value = _mm_set1_epi16(a);
for (i = 0; i < 4; ++i) {
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- dest += 8 - (stride * 32);
+ int j;
+ for (j = 0; j < 32; ++j) {
+ RECON_AND_STORE(dest + j * stride, dc_value);
+ }
+ dest += 8;
+ }
+}
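
vp9_idct32x32_1_add_sse2 handles the DC-only case: a single value a, derived from input[0] in the lines elided by the hunk header, is broadcast and added to every pixel of the 32x32 block. A scalar sketch, assuming clip255() saturates the same way _mm_packus_epi16 does inside RECON_AND_STORE:

    #include <stdint.h>

    static uint8_t clip255(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Sketch: what the vectorised DC-only path computes per pixel. */
    static void dc_only_add_32x32(uint8_t *dest, int stride, int a) {
      int r, c;
      for (r = 0; r < 32; r++)
        for (c = 0; c < 32; c++)
          dest[r * stride + c] = clip255(dest[r * stride + c] + a);
    }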
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+ __m128i ubounded, retval;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+ ubounded = _mm_cmpgt_epi16(value, max);
+ retval = _mm_andnot_si128(ubounded, value);
+ ubounded = _mm_and_si128(ubounded, max);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
+ return retval;
+}
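
clamp_high_sse2 saturates each 16-bit lane to the legal range of a bd-bit pixel: lanes above (1 << bd) - 1 are replaced by that maximum through the andnot/and/or select, and the final AND against cmpgt(retval, zero) zeroes every non-positive lane. Per lane it computes:

    /* Sketch: scalar equivalent of clamp_high_sse2 for one lane. */
    static short clamp_high_scalar(short value, int bd) {
      const short max = (short)((1 << bd) - 1);
      if (value > max) value = max;
      if (value < 0) value = 0;  /* cmpgt is strict, but 0 stays 0 either way */
      return value;
    }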
+
+void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ __m128i inptr[4];
+ __m128i sign_bits[2];
+ __m128i temp_mm, min_input, max_input;
+ int test;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int optimised_cols = 0;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i max = _mm_set1_epi16(12043);
+ const __m128i min = _mm_set1_epi16(-12043);
+ // Load input into __m128i
+ inptr[0] = _mm_loadu_si128((const __m128i *)input);
+ inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+ inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+ inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+ // Pack to 16 bits
+ inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+ inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (!test) {
+ // Do the row transform
+ idct4_sse2(inptr);
+
+ // Check the min & max values
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (test) {
+ transpose_4x4(inptr);
+ sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+ sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+ inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
+ inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+ inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+ inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+ _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+ _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+ _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+ _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp9_highbd_idct4(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+ }
+
+ if (optimised_cols) {
+ idct4_sse2(inptr);
+
+ // Final round and shift
+ inptr[0] = _mm_add_epi16(inptr[0], eight);
+ inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+ inptr[0] = _mm_srai_epi16(inptr[0], 4);
+ inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+ __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi64(
+ d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+ d2 = _mm_unpacklo_epi64(
+ d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+ d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+ d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+ // store input0
+ _mm_storel_epi64((__m128i *)dest, d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride), d0);
+ // store input2
+ _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+ // store input3
+ d2 = _mm_srli_si128(d2, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[4], temp_out[4];
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ vp9_highbd_idct4(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+ }
+ }
+}
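
This function establishes the pattern all the high-bitdepth paths below follow: pack the 32-bit tran_low_t coefficients to 16 bits, take the SSE2 transform only while every coefficient's magnitude stays within a bound (12043 here; 6201 and 3155 for the 8x8 and 16x16 variants, presumably scaled to each transform's worst-case intermediate growth), and otherwise fall back to the C transform. The same gate repeats between the row and column passes. A scalar sketch of the predicate, with the bound as a parameter:

    #include <stdint.h>

    /* Sketch: the range gate applied before each 16-bit transform pass.
     * The SSE2 code computes the same predicate with _mm_max_epi16 /
     * _mm_min_epi16 folds, _mm_cmpgt/_mm_cmplt against +/-bound, and a
     * single _mm_movemask_epi8. */
    static int fits_16bit_path(const int16_t *coeffs, int n, int bound) {
      int i;
      for (i = 0; i < n; i++)
        if (coeffs[i] > bound || coeffs[i] < -bound)
          return 0;  /* an intermediate could overflow int16: take the C path */
      return 1;
    }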
+
+void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[8];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ const __m128i max = _mm_set1_epi16(6201);
+ const __m128i min = _mm_set1_epi16(-6201);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 8; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ idct8_sse2(inptr);
+
+ // Find the min & max for the column transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ array_transpose_8x8(inptr, inptr);
+ for (i = 0; i < 8; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 8; ++i) {
+ vp9_highbd_idct8(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ }
+
+ if (optimised_cols) {
+ idct8_sse2(inptr);
+
+    // Final round & shift, reconstruction, and store
+ {
+ __m128i d[8];
+ for (i = 0; i < 8; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ inptr[i] = _mm_srai_epi16(inptr[i], 5);
+ d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[8], temp_out[8];
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp9_highbd_idct8(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+ }
+}
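
The un-optimised column passes all round, shift, and clip with two helpers defined elsewhere in libvpx; the sketch below shows how they read, with the exact signatures taken as assumptions since they are not part of this diff:

    #include <stdint.h>

    /* ROUND_POWER_OF_TWO(v, n) is round-half-up division by 2^n. */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    /* Sketch of highbd_clip_pixel_add: add the residual to the predicted
     * pixel and clip into [0, (1 << bd) - 1]. */
    static uint16_t highbd_clip_pixel_add_sketch(uint16_t dest, int32_t trans,
                                                 int bd) {
      const int32_t v = (int32_t)dest + trans;
      const int32_t max = (1 << bd) - 1;
      return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
    }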
+
+void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[8];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ const __m128i max = _mm_set1_epi16(6201);
+ const __m128i min = _mm_set1_epi16(-6201);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 8; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+  // Only the first 4 rows have non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 4; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ idct8_sse2(inptr);
+
+ // Find the min & max for the column transform
+    // N.B. Only the first 4 cols contain non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+      // Use the fact that only the first 4 rows contain non-zero coeffs
+ array_transpose_4X8(inptr, inptr);
+ for (i = 0; i < 4; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp9_highbd_idct8(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ }
+
+ if (optimised_cols) {
+ idct8_sse2(inptr);
+
+    // Final round & shift, reconstruction, and store
+ {
+ __m128i d[8];
+ for (i = 0; i < 8; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ inptr[i] = _mm_srai_epi16(inptr[i], 5);
+ d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[8], temp_out[8];
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp9_highbd_idct8(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+ }
+}
+
+void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[32];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_set1_epi16(32);
+ const __m128i max = _mm_set1_epi16(3155);
+ const __m128i min = _mm_set1_epi16(-3155);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 16; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+ inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 32; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ idct16_sse2(inptr, inptr + 16);
+
+ // Find the min & max for the column transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 32; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ array_transpose_16x16(inptr, inptr + 16);
+ for (i = 0; i < 16; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 16; ++i) {
+ vp9_highbd_idct16(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+ }
+
+ if (optimised_cols) {
+ idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift, reconstruction, and store
+ {
+ __m128i d[2];
+ for (i = 0; i < 16; i++) {
+ inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
+ inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+ inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
+ inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[16], temp_out[16];
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp9_highbd_idct16(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
}
}
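
Both 16x16 paths load each coefficient row as two halves: inptr[i] carries columns 0-7 of row i and inptr[i + 16] carries columns 8-15, which is why the final store loop indexes i and i+16 in lockstep. A small indexing sketch of that layout:

    /* Sketch: where coefficient (r, c) of the 16x16 block lands after
     * the pack loop. */
    static int vec_of(int r, int c) { return r + 16 * (c >> 3); }
    static int lane_of(int c) { return c & 7; }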
+
+void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[32];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_set1_epi16(32);
+ const __m128i max = _mm_set1_epi16(3155);
+ const __m128i min = _mm_set1_epi16(-3155);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 16; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+ inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+  // Since all non-zero dct coefficients are in the upper-left 4x4 area,
+  // we only need to consider the first 4 rows here.
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 4; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform (N.B. This transposes inptr)
+ idct16_sse2(inptr, inptr + 16);
+
+ // Find the min & max for the column transform
+    // N.B. Only the first 4 cols contain non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 16; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+      // Use the fact that only the first 4 rows contain non-zero coeffs
+ array_transpose_8x8(inptr, inptr);
+ array_transpose_8x8(inptr + 8, inptr + 16);
+ for (i = 0; i < 4; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp9_highbd_idct16(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+ }
+
+ if (optimised_cols) {
+ idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift, reconstruction, and store
+ {
+ __m128i d[2];
+ for (i = 0; i < 16; i++) {
+ inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
+ inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+ inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
+ inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[16], temp_out[16];
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp9_highbd_idct16(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
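
All of the high-bitdepth entry points above accept a uint8_t *dest8 and widen it with CONVERT_TO_SHORTPTR. The macro itself lives elsewhere in the tree; as an assumption about this vintage of libvpx, it amounts to re-addressing the frame buffer in 16-bit pixels:

    #include <stdint.h>

    /* Assumption: a sketch of the convention, not a definition from this
     * diff. High-bitdepth buffers are carried as pre-shifted byte
     * pointers and converted to uint16_t* at the point of use. */
    #define CONVERT_TO_SHORTPTR_SKETCH(x) ((uint16_t *)(((uintptr_t)(x)) << 1))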
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h
index 0f179b49a57..984363d4035 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h
@@ -115,7 +115,6 @@ static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
d0 = _mm_add_epi16(in_x, d0); \
d0 = _mm_packus_epi16(d0, d0); \
_mm_storel_epi64((__m128i *)(dest), d0); \
- dest += stride; \
}
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
@@ -156,20 +155,20 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
in[14] = _mm_srai_epi16(in[14], 6);
in[15] = _mm_srai_epi16(in[15], 6);
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+ RECON_AND_STORE(dest + 8 * stride, in[8]);
+ RECON_AND_STORE(dest + 9 * stride, in[9]);
+ RECON_AND_STORE(dest + 10 * stride, in[10]);
+ RECON_AND_STORE(dest + 11 * stride, in[11]);
+ RECON_AND_STORE(dest + 12 * stride, in[12]);
+ RECON_AND_STORE(dest + 13 * stride, in[13]);
+ RECON_AND_STORE(dest + 14 * stride, in[14]);
+ RECON_AND_STORE(dest + 15 * stride, in[15]);
}
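
The header hunk above removes the dest += stride side effect from RECON_AND_STORE, which is what lets every caller in this diff index with dest + n * stride. Only the macro's tail is visible in the hunk; filling in the usual load/widen prologue as an assumption, the whole macro now reads roughly:

    #include <emmintrin.h>

    /* Sketch: RECON_AND_STORE after this change. The first two
     * statements (load 8 pixels, widen to 16 bits against 'zero', a
     * zeroed __m128i from the enclosing scope) are assumed from context
     * outside this hunk. */
    #define RECON_AND_STORE_SKETCH(dest, in_x)             \
      {                                                    \
        __m128i d0 = _mm_loadl_epi64((__m128i *)(dest));   \
        d0 = _mm_unpacklo_epi8(d0, zero);                  \
        d0 = _mm_add_epi16(in_x, d0);                      \
        d0 = _mm_packus_epi16(d0, d0);                     \
        _mm_storel_epi64((__m128i *)(dest), d0);           \
      }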
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c
deleted file mode 100644
index 73bf5d1d78e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c
+++ /dev/null
@@ -1,762 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#if defined(_MSC_VER) && _MSC_VER <= 1500
-// Need to include math.h before calling tmmintrin.h/intrin.h
-// in certain versions of MSVS.
-#include <math.h>
-#endif
-#include <tmmintrin.h> // SSSE3
-#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
-
-static void idct16_8col(__m128i *in, int round) {
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i v[16], u[16], s[16], t[16];
-
- // stage 1
- s[0] = in[0];
- s[1] = in[8];
- s[2] = in[4];
- s[3] = in[12];
- s[4] = in[2];
- s[5] = in[10];
- s[6] = in[6];
- s[7] = in[14];
- s[8] = in[1];
- s[9] = in[9];
- s[10] = in[5];
- s[11] = in[13];
- s[12] = in[3];
- s[13] = in[11];
- s[14] = in[7];
- s[15] = in[15];
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[15]);
- u[1] = _mm_unpackhi_epi16(s[8], s[15]);
- u[2] = _mm_unpacklo_epi16(s[9], s[14]);
- u[3] = _mm_unpackhi_epi16(s[9], s[14]);
- u[4] = _mm_unpacklo_epi16(s[10], s[13]);
- u[5] = _mm_unpackhi_epi16(s[10], s[13]);
- u[6] = _mm_unpacklo_epi16(s[11], s[12]);
- u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[8] = _mm_packs_epi32(u[0], u[1]);
- s[15] = _mm_packs_epi32(u[2], u[3]);
- s[9] = _mm_packs_epi32(u[4], u[5]);
- s[14] = _mm_packs_epi32(u[6], u[7]);
- s[10] = _mm_packs_epi32(u[8], u[9]);
- s[13] = _mm_packs_epi32(u[10], u[11]);
- s[11] = _mm_packs_epi32(u[12], u[13]);
- s[12] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- t[0] = s[0];
- t[1] = s[1];
- t[2] = s[2];
- t[3] = s[3];
- u[0] = _mm_unpacklo_epi16(s[4], s[7]);
- u[1] = _mm_unpackhi_epi16(s[4], s[7]);
- u[2] = _mm_unpacklo_epi16(s[5], s[6]);
- u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[4] = _mm_packs_epi32(u[0], u[1]);
- t[7] = _mm_packs_epi32(u[2], u[3]);
- t[5] = _mm_packs_epi32(u[4], u[5]);
- t[6] = _mm_packs_epi32(u[6], u[7]);
- t[8] = _mm_add_epi16(s[8], s[9]);
- t[9] = _mm_sub_epi16(s[8], s[9]);
- t[10] = _mm_sub_epi16(s[11], s[10]);
- t[11] = _mm_add_epi16(s[10], s[11]);
- t[12] = _mm_add_epi16(s[12], s[13]);
- t[13] = _mm_sub_epi16(s[12], s[13]);
- t[14] = _mm_sub_epi16(s[15], s[14]);
- t[15] = _mm_add_epi16(s[14], s[15]);
-
- // stage 4
- u[0] = _mm_add_epi16(t[0], t[1]);
- u[1] = _mm_sub_epi16(t[0], t[1]);
- u[2] = _mm_unpacklo_epi16(t[2], t[3]);
- u[3] = _mm_unpackhi_epi16(t[2], t[3]);
- u[4] = _mm_unpacklo_epi16(t[9], t[14]);
- u[5] = _mm_unpackhi_epi16(t[9], t[14]);
- u[6] = _mm_unpacklo_epi16(t[10], t[13]);
- u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
- s[0] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
- s[1] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_add_epi16(t[4], t[5]);
- s[5] = _mm_sub_epi16(t[4], t[5]);
- s[6] = _mm_sub_epi16(t[7], t[6]);
- s[7] = _mm_add_epi16(t[6], t[7]);
- s[8] = t[8];
- s[15] = t[15];
- s[9] = _mm_packs_epi32(u[8], u[9]);
- s[14] = _mm_packs_epi32(u[10], u[11]);
- s[10] = _mm_packs_epi32(u[12], u[13]);
- s[13] = _mm_packs_epi32(u[14], u[15]);
- s[11] = t[11];
- s[12] = t[12];
-
- // stage 5
- t[0] = _mm_add_epi16(s[0], s[3]);
- t[1] = _mm_add_epi16(s[1], s[2]);
- t[2] = _mm_sub_epi16(s[1], s[2]);
- t[3] = _mm_sub_epi16(s[0], s[3]);
- t[4] = s[4];
- t[7] = s[7];
-
- u[0] = _mm_sub_epi16(s[6], s[5]);
- u[1] = _mm_add_epi16(s[6], s[5]);
- t[5] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
- t[6] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
-
- t[8] = _mm_add_epi16(s[8], s[11]);
- t[9] = _mm_add_epi16(s[9], s[10]);
- t[10] = _mm_sub_epi16(s[9], s[10]);
- t[11] = _mm_sub_epi16(s[8], s[11]);
- t[12] = _mm_sub_epi16(s[15], s[12]);
- t[13] = _mm_sub_epi16(s[14], s[13]);
- t[14] = _mm_add_epi16(s[13], s[14]);
- t[15] = _mm_add_epi16(s[12], s[15]);
-
- // stage 6
- if (round == 1) {
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_unpacklo_epi16(t[10], t[13]);
- u[1] = _mm_unpackhi_epi16(t[10], t[13]);
- u[2] = _mm_unpacklo_epi16(t[11], t[12]);
- u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- s[10] = _mm_packs_epi32(u[0], u[1]);
- s[13] = _mm_packs_epi32(u[2], u[3]);
- s[11] = _mm_packs_epi32(u[4], u[5]);
- s[12] = _mm_packs_epi32(u[6], u[7]);
- s[14] = t[14];
- s[15] = t[15];
- } else {
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_sub_epi16(t[13], t[10]);
- u[1] = _mm_add_epi16(t[13], t[10]);
- u[2] = _mm_sub_epi16(t[12], t[11]);
- u[3] = _mm_add_epi16(t[12], t[11]);
-
- s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
- s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
- s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
- s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
- s[14] = t[14];
- s[15] = t[15];
- }
-
- // stage 7
- in[0] = _mm_add_epi16(s[0], s[15]);
- in[1] = _mm_add_epi16(s[1], s[14]);
- in[2] = _mm_add_epi16(s[2], s[13]);
- in[3] = _mm_add_epi16(s[3], s[12]);
- in[4] = _mm_add_epi16(s[4], s[11]);
- in[5] = _mm_add_epi16(s[5], s[10]);
- in[6] = _mm_add_epi16(s[6], s[9]);
- in[7] = _mm_add_epi16(s[7], s[8]);
- in[8] = _mm_sub_epi16(s[7], s[8]);
- in[9] = _mm_sub_epi16(s[6], s[9]);
- in[10] = _mm_sub_epi16(s[5], s[10]);
- in[11] = _mm_sub_epi16(s[4], s[11]);
- in[12] = _mm_sub_epi16(s[3], s[12]);
- in[13] = _mm_sub_epi16(s[2], s[13]);
- in[14] = _mm_sub_epi16(s[1], s[14]);
- in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
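
The deleted idct16 code above mixes two equivalent fixed-point butterflies: an exact path that widens with _mm_madd_epi16, adds k__DCT_CONST_ROUNDING and shifts right by DCT_CONST_BITS (14 in libvpx, so the rounding constant is 1 << 13), and a cheap path that feeds _mm_mulhrs_epi16 a pre-doubled cosine (the _x2 constants). Since mulhrs computes (a*b + (1 << 14)) >> 15, doubling the constant reproduces the exact path's (a*c + (1 << 13)) >> 14 in a single instruction. A minimal scalar model, assuming only the DCT_CONST_BITS value from vp9_idct.h:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* Exact path: the madd / add-rounding / arithmetic-shift sequence. */
static int16_t round_shift(int32_t v) {
  return (int16_t)((v + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}

/* mulhrs path: identical result when the constant is pre-doubled. */
static int16_t mulhrs_model(int16_t a, int16_t k_x2) {
  return (int16_t)(((int32_t)a * k_x2 + (1 << 14)) >> 15);
}
/* round_shift(a * c) == mulhrs_model(a, 2 * c) for in-range inputs. */
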
-static void idct16_sse2(__m128i *in0, __m128i *in1, int round) {
- array_transpose_16x16(in0, in1);
- idct16_8col(in0, round);
- idct16_8col(in1, round);
-}
-
-void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest,
- int stride) {
- __m128i in0[16], in1[16];
-
- load_buffer_8x16(input, in0);
- input += 8;
- load_buffer_8x16(input, in1);
-
- idct16_sse2(in0, in1, 0);
- idct16_sse2(in0, in1, 1);
-
- write_buffer_8x16(dest, in0, stride);
- dest += 8;
- write_buffer_8x16(dest, in1, stride);
-}
-
-static void idct16_10_r1(__m128i *in, __m128i *l) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_01 = dual_set_epi16(3212, 32610);
- const __m128i stg2_67 = dual_set_epi16(-9512, 31358);
- const __m128i stg3_01 = dual_set_epi16(6392, 32138);
- const __m128i stg4_01 = dual_set_epi16(23170, 23170);
-
-
-
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- __m128i stp1_0, stp1_1, stp1_4, stp1_6,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4;
-
- // Stage2
- {
- const __m128i lo_1_15 = _mm_unpackhi_epi64(in[0], in[0]);
- const __m128i lo_13_3 = _mm_unpackhi_epi64(in[1], in[1]);
-
- stp2_8 = _mm_mulhrs_epi16(lo_1_15, stg2_01);
- stp2_11 = _mm_mulhrs_epi16(lo_13_3, stg2_67);
- }
-
- // Stage3
- {
- const __m128i lo_2_14 = _mm_unpacklo_epi64(in[1], in[1]);
- stp1_4 = _mm_mulhrs_epi16(lo_2_14, stg3_01);
-
- stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
- stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
- }
-
- // Stage4
- {
- const __m128i lo_0_8 = _mm_unpacklo_epi64(in[0], in[0]);
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
- tmp0 = _mm_mulhrs_epi16(lo_0_8, stg4_01);
- tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
- tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
- tmp2 = _mm_madd_epi16(lo_10_13, stg4_6);
- tmp4 = _mm_madd_epi16(lo_10_13, stg4_7);
-
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
-
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-
- stp1_0 = _mm_unpacklo_epi64(tmp0, tmp0);
- stp1_1 = _mm_unpackhi_epi64(tmp0, tmp0);
- stp2_9 = _mm_packs_epi32(tmp1, tmp3);
- stp2_10 = _mm_packs_epi32(tmp2, tmp4);
-
- stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
- }
-
- // Stage5 and Stage6
- {
- tmp0 = _mm_add_epi16(stp2_8, stp2_11);
- tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
- tmp2 = _mm_add_epi16(stp2_9, stp2_10);
- tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
- stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
- stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
- stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
- stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
- stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
- stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
- stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
- stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
- }
-
- // Stage6
- {
- const __m128i lo_6_5 = _mm_add_epi16(stp2_6, stp1_4);
- const __m128i lo_6_6 = _mm_sub_epi16(stp2_6, stp1_4);
- const __m128i lo_10_13 = _mm_sub_epi16(stp1_13, stp1_10);
- const __m128i lo_10_14 = _mm_add_epi16(stp1_13, stp1_10);
- const __m128i lo_11_12 = _mm_sub_epi16(stp1_12, stp1_11);
- const __m128i lo_11_13 = _mm_add_epi16(stp1_12, stp1_11);
-
- tmp1 = _mm_unpacklo_epi64(lo_6_5, lo_6_6);
- tmp0 = _mm_unpacklo_epi64(lo_10_13, lo_10_14);
- tmp4 = _mm_unpacklo_epi64(lo_11_12, lo_11_13);
-
- stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
- tmp0 = _mm_mulhrs_epi16(tmp0, stg4_01);
- tmp4 = _mm_mulhrs_epi16(tmp4, stg4_01);
-
- stp2_10 = _mm_unpacklo_epi64(tmp0, zero);
- stp2_13 = _mm_unpackhi_epi64(tmp0, zero);
- stp2_11 = _mm_unpacklo_epi64(tmp4, zero);
- stp2_12 = _mm_unpackhi_epi64(tmp4, zero);
-
- tmp0 = _mm_add_epi16(stp1_0, stp1_4);
- tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
- tmp2 = _mm_add_epi16(stp1_1, stp1_6);
- tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
- stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
- stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
- stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
- stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
- stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
- stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
- stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
- }
-
- // Stage7. Left 8x16 only.
- l[0] = _mm_add_epi16(stp2_0, stp1_15);
- l[1] = _mm_add_epi16(stp2_1, stp1_14);
- l[2] = _mm_add_epi16(stp2_2, stp2_13);
- l[3] = _mm_add_epi16(stp2_3, stp2_12);
- l[4] = _mm_add_epi16(stp2_4, stp2_11);
- l[5] = _mm_add_epi16(stp2_5, stp2_10);
- l[6] = _mm_add_epi16(stp2_6, stp1_9);
- l[7] = _mm_add_epi16(stp2_7, stp1_8);
- l[8] = _mm_sub_epi16(stp2_7, stp1_8);
- l[9] = _mm_sub_epi16(stp2_6, stp1_9);
- l[10] = _mm_sub_epi16(stp2_5, stp2_10);
- l[11] = _mm_sub_epi16(stp2_4, stp2_11);
- l[12] = _mm_sub_epi16(stp2_3, stp2_12);
- l[13] = _mm_sub_epi16(stp2_2, stp2_13);
- l[14] = _mm_sub_epi16(stp2_1, stp1_14);
- l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-}
-
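
In this eob <= 10 fast path only the top-left 4x4 of coefficients can be nonzero, so the first pass works on a 4x8 transpose and several stage outputs are assembled with 64-bit unpacks instead of full butterflies. The dual_set_epi16 constants are the vp9_idct.h cosines doubled for _mm_mulhrs_epi16; a sanity check, assuming those header values:

#include <assert.h>

/* cospi_*_64 constants as defined in vp9/common/vp9_idct.h. */
enum {
  cospi_2_64 = 16305, cospi_4_64 = 16069, cospi_6_64 = 15679,
  cospi_16_64 = 11585, cospi_26_64 = 4756, cospi_28_64 = 3196,
  cospi_30_64 = 1606
};

int main(void) {
  assert(2 * cospi_30_64 == 3212 && 2 * cospi_2_64 == 32610);   /* stg2_01 */
  assert(-2 * cospi_26_64 == -9512 && 2 * cospi_6_64 == 31358); /* stg2_67 */
  assert(2 * cospi_28_64 == 6392 && 2 * cospi_4_64 == 32138);   /* stg3_01 */
  assert(2 * cospi_16_64 == 23170);                             /* stg4_01 */
  return 0;
}
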
-static void idct16_10_r2(__m128i *in) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- const __m128i stg2_0 = dual_set_epi16(3212, 3212);
- const __m128i stg2_1 = dual_set_epi16(32610, 32610);
- const __m128i stg2_6 = dual_set_epi16(-9512, -9512);
- const __m128i stg2_7 = dual_set_epi16(31358, 31358);
- const __m128i stg3_0 = dual_set_epi16(6392, 6392);
- const __m128i stg3_1 = dual_set_epi16(32138, 32138);
- const __m128i stg4_01 = dual_set_epi16(23170, 23170);
-
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- __m128i stp1_0, stp1_2, stp1_3, stp1_5, stp1_6,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- /* Stage2 */
- {
- stp1_8_0 = _mm_mulhrs_epi16(in[1], stg2_0);
- stp1_15 = _mm_mulhrs_epi16(in[1], stg2_1);
- stp1_11 = _mm_mulhrs_epi16(in[3], stg2_6);
- stp1_12_0 = _mm_mulhrs_epi16(in[3], stg2_7);
- }
-
- /* Stage3 */
- {
- stp2_4 = _mm_mulhrs_epi16(in[2], stg3_0);
- stp2_7 = _mm_mulhrs_epi16(in[2], stg3_1);
-
- stp1_9 = stp1_8_0;
- stp1_10 = stp1_11;
-
- stp1_13 = stp1_12_0;
- stp1_14 = stp1_15;
- }
-
- /* Stage4 */
- {
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
-
- stp1_0 = _mm_mulhrs_epi16(in[0], stg4_01);
-
- stp2_5 = stp2_4;
- stp2_6 = stp2_7;
-
-
- tmp0 = _mm_madd_epi16(lo_9_14, stg4_4);
- tmp1 = _mm_madd_epi16(hi_9_14, stg4_4);
- tmp2 = _mm_madd_epi16(lo_9_14, stg4_5);
- tmp3 = _mm_madd_epi16(hi_9_14, stg4_5);
- tmp4 = _mm_madd_epi16(lo_10_13, stg4_6);
- tmp5 = _mm_madd_epi16(hi_10_13, stg4_6);
- tmp6 = _mm_madd_epi16(lo_10_13, stg4_7);
- tmp7 = _mm_madd_epi16(hi_10_13, stg4_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, 14);
- tmp1 = _mm_srai_epi32(tmp1, 14);
- tmp2 = _mm_srai_epi32(tmp2, 14);
- tmp3 = _mm_srai_epi32(tmp3, 14);
- tmp4 = _mm_srai_epi32(tmp4, 14);
- tmp5 = _mm_srai_epi32(tmp5, 14);
- tmp6 = _mm_srai_epi32(tmp6, 14);
- tmp7 = _mm_srai_epi32(tmp7, 14);
-
- stp2_9 = _mm_packs_epi32(tmp0, tmp1);
- stp2_14 = _mm_packs_epi32(tmp2, tmp3);
- stp2_10 = _mm_packs_epi32(tmp4, tmp5);
- stp2_13 = _mm_packs_epi32(tmp6, tmp7);
- }
-
- /* Stage5 */
- {
- stp1_2 = stp1_0;
- stp1_3 = stp1_0;
-
- tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
- tmp1 = _mm_add_epi16(stp2_6, stp2_5);
-
- stp1_5 = _mm_mulhrs_epi16(tmp0, stg4_01);
- stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
-
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
-
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
- }
-
- /* Stage6 */
- {
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
- stp2_1 = _mm_add_epi16(stp1_0, stp1_6);
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
-
- tmp0 = _mm_sub_epi16(stp1_13, stp1_10);
- tmp1 = _mm_add_epi16(stp1_13, stp1_10);
- tmp2 = _mm_sub_epi16(stp1_12, stp1_11);
- tmp3 = _mm_add_epi16(stp1_12, stp1_11);
-
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_0, stp1_6);
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
-
- stp2_10 = _mm_mulhrs_epi16(tmp0, stg4_01);
- stp2_13 = _mm_mulhrs_epi16(tmp1, stg4_01);
- stp2_11 = _mm_mulhrs_epi16(tmp2, stg4_01);
- stp2_12 = _mm_mulhrs_epi16(tmp3, stg4_01);
- }
-
- // Stage7
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-}
-
-void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
- const __m128i zero = _mm_setzero_si128();
- __m128i in[16], l[16];
-
- int i;
- // First 1-D inverse DCT
- // Load input data.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-
- TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
- idct16_10_r1(in, l);
-
- // Second 1-D inverse transform, performed per 8x16 block
- for (i = 0; i < 2; i++) {
- array_transpose_4X8(l + 8*i, in);
-
- idct16_10_r2(in);
-
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
-
- dest += 8 - (stride * 16);
- }
-}
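
The 10-coefficient function above finishes the same way as the full 256-coefficient version: each residual is biased by 1 << 5, shifted right by 6, and RECON_AND_STORE adds it to the prediction with unsigned saturation. A scalar model of that tail (clip_pixel is written out here; the real macro widens dest with punpcklbw and re-packs with packuswb):

#include <stdint.h>

static uint8_t clip_pixel(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

/* Scalar model of the final_rounding / srai-by-6 / RECON_AND_STORE tail. */
static void recon_row_model(uint8_t *dest, const int16_t *res, int n) {
  int i;
  for (i = 0; i < n; i++)
    dest[i] = clip_pixel(dest[i] + ((res[i] + 32) >> 6));
}
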
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
index 69b07f64575..22b5731886c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
@@ -15,6 +15,11 @@ pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_16: times 8 dw 16
pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4: times 8 dw 2
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
+pw2_32: times 8 dw 16
SECTION .text
@@ -40,6 +45,46 @@ cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
RET
INIT_MMX sse
+cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [leftq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshufw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [aboveq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshufw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
GET_GOT goffsetq
@@ -68,6 +113,91 @@ cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
RESTORE_GOT
RET
+INIT_MMX sse
+cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ pshufw m0, m0, 0x0
+ packuswb m0, m0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ pshufw m0, m0, 0x0
+ packuswb m0, m0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movd m0, [GLOBAL(dc_128)]
+ movd [dstq ], m0
+ movd [dstq+strideq ], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [GLOBAL(dc_128)]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
GET_GOT goffsetq
@@ -100,6 +230,91 @@ cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
RESTORE_GOT
REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ pxor m2, m2
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ pxor m2, m2
+ mova m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
GET_GOT goffsetq
@@ -142,6 +357,101 @@ cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
RESTORE_GOT
REP_RET
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ mova m2, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
INIT_MMX sse
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
movd m0, [aboveq]
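
The predictors added above fill in the border-only DC cases: dc_left and dc_top average a single edge, using the new pw2_* constants as halved rounding biases (pw2_8 == 4 for an 8-pixel sum shifted right by 3, and likewise for the other sizes), and dc_128 floods the block with mid-gray when neither edge is available. A scalar model of the 8x8 top-only case:

#include <stdint.h>

/* Scalar model of dc_top_predictor_8x8: psadbw sums the row above,
 * paddw pw2_8 adds the bias (4), psraw 3 divides, then the value is
 * broadcast and stored row by row. */
static void dc_top_8x8_model(uint8_t *dst, int stride, const uint8_t *above) {
  int sum = 0, r, c;
  uint8_t dc;
  for (c = 0; c < 8; c++) sum += above[c];
  dc = (uint8_t)((sum + 4) >> 3);
  for (r = 0; r < 8; r++)
    for (c = 0; c < 8; c++) dst[r * stride + c] = dc;
}
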
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
index 439c028f29d..0cb0912ad62 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -9,6 +9,7 @@
*/
#include <immintrin.h> /* AVX2 */
+#include "vpx_ports/mem.h"
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
const unsigned char *_blimit, const unsigned char *_limit,
@@ -392,6 +393,11 @@ static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
}
}
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+ 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+ 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
const unsigned char *_blimit, const unsigned char *_limit,
const unsigned char *_thresh) {
@@ -401,6 +407,9 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
__m128i p7, p6, p5;
__m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
__m128i q5, q6, q7;
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+ q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+ p256_0, q256_0;
const __m128i thresh = _mm_broadcastb_epi8(
_mm_cvtsi32_si128((int) _thresh[0]));
@@ -409,16 +418,37 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
const __m128i blimit = _mm_broadcastb_epi8(
_mm_cvtsi32_si128((int) _blimit[0]));
- p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
- p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
- q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+ p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 5 * p)));
+ p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 4 * p)));
+ p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 3 * p)));
+ p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 2 * p)));
+ p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 1 * p)));
+ q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 0 * p)));
+ q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 1 * p)));
+ q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 2 * p)));
+ q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 3 * p)));
+ q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 4 * p)));
+
+ p4 = _mm256_castsi256_si128(p256_4);
+ p3 = _mm256_castsi256_si128(p256_3);
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+ q3 = _mm256_castsi256_si128(q256_3);
+ q4 = _mm256_castsi256_si128(q256_4);
{
const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
@@ -534,23 +564,35 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask);
- p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
- q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+ p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 6 * p)));
+ q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 5 * p)));
+ p5 = _mm256_castsi256_si128(p256_5);
+ q5 = _mm256_castsi256_si128(q256_5);
flat2 = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
_mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
flat2 = _mm_max_epu8(work, flat2);
- p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
- q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+ p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 7 * p)));
+ q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 6 * p)));
+ p6 = _mm256_castsi256_si128(p256_6);
+ q6 = _mm256_castsi256_si128(q256_6);
work = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
_mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
flat2 = _mm_max_epu8(work, flat2);
- p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
- q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+ p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 8 * p)));
+ q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 7 * p)));
+ p7 = _mm256_castsi256_si128(p256_7);
+ q7 = _mm256_castsi256_si128(q256_7);
work = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
_mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
@@ -566,29 +608,28 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
{
const __m256i eight = _mm256_set1_epi16(8);
const __m256i four = _mm256_set1_epi16(4);
- __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
- q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
- p256_0, q256_0;
__m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
res_q;
- p256_7 = _mm256_cvtepu8_epi16(p7);
- p256_6 = _mm256_cvtepu8_epi16(p6);
- p256_5 = _mm256_cvtepu8_epi16(p5);
- p256_4 = _mm256_cvtepu8_epi16(p4);
- p256_3 = _mm256_cvtepu8_epi16(p3);
- p256_2 = _mm256_cvtepu8_epi16(p2);
- p256_1 = _mm256_cvtepu8_epi16(p1);
- p256_0 = _mm256_cvtepu8_epi16(p0);
- q256_0 = _mm256_cvtepu8_epi16(q0);
- q256_1 = _mm256_cvtepu8_epi16(q1);
- q256_2 = _mm256_cvtepu8_epi16(q2);
- q256_3 = _mm256_cvtepu8_epi16(q3);
- q256_4 = _mm256_cvtepu8_epi16(q4);
- q256_5 = _mm256_cvtepu8_epi16(q5);
- q256_6 = _mm256_cvtepu8_epi16(q6);
- q256_7 = _mm256_cvtepu8_epi16(q7);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)filt_loopfilter_avx2);
+ p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+ p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+ p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+ p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+ q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+ q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+ q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+ q256_7 = _mm256_shuffle_epi8(q256_7, filter);
pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
_mm256_add_epi16(p256_4, p256_3));
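
The rewrite above drops the sixteen _mm256_cvtepu8_epi16 conversions: each row is loaded once with _mm256_broadcast_pd, which duplicates the 16 source bytes into both 128-bit halves, and _mm256_shuffle_epi8 against filt_loopfilter_avx2 then widens them, because pshufb emits a zero byte whenever an index has its high bit set. With the {0,128, 1,128, ..., 15,128} mask, the low half yields bytes 0..7 and the high half bytes 8..15, each zero-extended to 16 bits. A scalar model of the per-lane shuffle semantics this relies on:

#include <stdint.h>

/* Per-128-bit-lane semantics of (v)pshufb: an index with the high
 * bit set selects zero, otherwise the low four bits pick a byte. */
static void shuffle_epi8_model(const uint8_t src[16], const uint8_t idx[16],
                               uint8_t out[16]) {
  int i;
  for (i = 0; i < 16; i++)
    out[i] = (idx[i] & 0x80) ? 0 : src[idx[i] & 0x0f];
}
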
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 320328e2129..8723d32836d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -729,12 +729,12 @@ void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh, int count) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
const __m128i zero = _mm_set1_epi16(0);
const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
const __m128i limit = _mm_load_si128((const __m128i *)_limit);
@@ -948,12 +948,12 @@ void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
const uint8_t *_blimit1,
const uint8_t *_limit1,
const uint8_t *_thresh1) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
const __m128i zero = _mm_set1_epi16(0);
const __m128i blimit =
_mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
@@ -1461,7 +1461,7 @@ void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
unsigned char *src[2];
unsigned char *dst[2];
@@ -1484,7 +1484,7 @@ void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh, int count) {
- DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
+ DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
unsigned char *src[1];
unsigned char *dst[1];
(void)count;
@@ -1511,7 +1511,7 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
unsigned char *src[2];
unsigned char *dst[2];
@@ -1535,7 +1535,7 @@ void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
- DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
+ DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
unsigned char *src[2];
unsigned char *dst[2];
@@ -1562,7 +1562,7 @@ void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+ DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
// Transpose 16x16
transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
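
These hunks migrate from the removed DECLARE_ALIGNED_ARRAY(alignment, type, name, count) form to DECLARE_ALIGNED(alignment, type, declarator), which folds the element count into the declarator. A sketch of what the macro expands to (the exact definitions live in vpx_ports/mem.h; abbreviated here):

/* Sketch only; see vpx_ports/mem.h for the real branches. */
#if defined(__GNUC__)
#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
#elif defined(_MSC_VER)
#define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val
#else
#define DECLARE_ALIGNED(n, typ, val) typ val
#endif

/* Usage after the change: a 16-byte-aligned scratch buffer. */
DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
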
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
index 91055b9f9d4..f5f7d5af784 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -601,9 +601,6 @@ align 16
t80:
times 8 db 0x80
align 16
-t1s:
- times 8 db 0x01
-align 16
t3:
times 8 db 0x03
align 16
@@ -612,15 +609,3 @@ t4:
align 16
ones:
times 4 dw 0x0001
-align 16
-s27:
- times 4 dw 0x1b00
-align 16
-s18:
- times 4 dw 0x1200
-align 16
-s9:
- times 4 dw 0x0900
-align 16
-s63:
- times 4 dw 0x003f
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
new file mode 100644
index 00000000000..6029420d114
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -0,0 +1,287 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+; This file is a duplicate of mfqe_sse2.asm in VP8.
+; TODO(jackychen): Find a way to fix the duplicate.
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_filter_by_weight16x16_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
+sym(vp9_filter_by_weight16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 16 ; loop count
+ pxor xmm6, xmm6
+
+.combine
+ movdqa xmm2, [rax]
+ movdqa xmm4, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm6
+ punpckhbw xmm3, xmm6
+ pmullw xmm2, xmm0
+ pmullw xmm3, xmm0
+
+ ; dst * dst_weight
+ movdqa xmm5, xmm4
+ punpcklbw xmm4, xmm6
+ punpckhbw xmm5, xmm6
+ pmullw xmm4, xmm1
+ pmullw xmm5, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ paddw xmm3, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+ psrlw xmm3, 4
+
+ packuswb xmm2, xmm3
+ movdqa [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp9_filter_by_weight8x8_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
+sym(vp9_filter_by_weight8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 8 ; loop count
+ pxor xmm4, xmm4
+
+.combine
+ movq xmm2, [rax]
+ movq xmm3, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ punpcklbw xmm2, xmm4
+ pmullw xmm2, xmm0
+
+ ; dst * dst_weight
+ punpcklbw xmm3, xmm4
+ pmullw xmm3, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm3
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+
+ packuswb xmm2, xmm4
+ movq [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp9_variance_and_sad_16x16_sse2 | arg
+;(
+; unsigned char *src1, 0
+; int stride1, 1
+; unsigned char *src2, 2
+; int stride2, 3
+; unsigned int *variance, 4
+; unsigned int *sad, 5
+;)
+global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp9_variance_and_sad_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ; src1
+ mov rcx, arg(1) ; stride1
+ mov rdx, arg(2) ; src2
+ mov rdi, arg(3) ; stride2
+
+ mov rsi, 16 ; block height
+
+ ; Prep accumulator registers
+ pxor xmm3, xmm3 ; SAD
+ pxor xmm4, xmm4 ; sum of src2
+ pxor xmm5, xmm5 ; sum of src2^2
+
+ ; Because we're working with the actual output frames
+ ; we can't depend on any kind of data alignment.
+.accumulate
+ movdqa xmm0, [rax] ; src1
+ movdqa xmm1, [rdx] ; src2
+ add rax, rcx ; src1 + stride1
+ add rdx, rdi ; src2 + stride2
+
+ ; SAD(src1, src2)
+ psadbw xmm0, xmm1
+ paddusw xmm3, xmm0
+
+ ; SUM(src2)
+ pxor xmm2, xmm2
+ psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
+ paddusw xmm4, xmm2
+
+ ; pmaddubsw would be ideal if it took two unsigned values. instead,
+ ; it expects a signed and an unsigned value. so instead we zero extend
+ ; and operate on words.
+ pxor xmm2, xmm2
+ movdqa xmm0, xmm1
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ paddd xmm5, xmm0
+ paddd xmm5, xmm1
+
+ sub rsi, 1
+ jnz .accumulate
+
+ ; phaddd only operates on adjacent double words.
+ ; Finalize SAD and store
+ movdqa xmm0, xmm3
+ psrldq xmm0, 8
+ paddusw xmm0, xmm3
+ paddd xmm0, [GLOBAL(t128)]
+ psrld xmm0, 8
+
+ mov rax, arg(5)
+ movd [rax], xmm0
+
+ ; Accumulate sum of src2
+ movdqa xmm0, xmm4
+ psrldq xmm0, 8
+ paddusw xmm0, xmm4
+ ; Square src2. Ignore high value
+ pmuludq xmm0, xmm0
+ psrld xmm0, 8
+
+ ; phaddw could be used to sum adjacent values but we want
+ ; all the values summed. promote to doubles, accumulate,
+ ; shift and sum
+ pxor xmm2, xmm2
+ movdqa xmm1, xmm5
+ punpckldq xmm1, xmm2
+ punpckhdq xmm5, xmm2
+ paddd xmm1, xmm5
+ movdqa xmm2, xmm1
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ psubd xmm1, xmm0
+
+ ; (variance + 128) >> 8
+ paddd xmm1, [GLOBAL(t128)]
+ psrld xmm1, 8
+ mov rax, arg(4)
+
+ movd [rax], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t128:
+%ifndef __NASM_VER__
+ ddq 128
+%elif CONFIG_BIG_ENDIAN
+ dq 0, 128
+%else
+ dq 128, 0
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+ times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+ times 8 dw 0x08
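
The new file implements the MFQE weighted blend: with MFQE_PRECISION == 4, tMFQE holds 1 << 4 and tMFQE_round holds 1 << 3, so each output pixel is (src * src_weight + dst * (16 - src_weight) + 8) >> 4, which the asm computes in 16-bit lanes with pmullw/paddw/psrlw. A scalar model covering both the 16x16 and 8x8 kernels:

#include <stdint.h>

/* Scalar model of vp9_filter_by_weight{16x16,8x8}_sse2. */
static void filter_by_weight_model(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   int block_size, int src_weight) {
  const int dst_weight = 16 - src_weight;  /* psubw from tMFQE */
  int r, c;
  for (r = 0; r < block_size; r++)
    for (c = 0; c < block_size; c++)
      dst[r * dst_stride + c] =
          (uint8_t)((src[r * src_stride + c] * src_weight +
                     dst[r * dst_stride + c] * dst_weight + 8) >> 4);
}
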
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
index c4efa6565f3..71dbb402dd4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -312,9 +312,11 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
unsigned int out_pitch,
unsigned int output_height,
int16_t *filter) {
- __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+ __m128i addFilterReg64, filtersReg, minReg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -333,27 +335,26 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
  // duplicate only the fourth 16 bits in the filter
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+ // load the first 7 rows of 8 bytes
+ srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+ srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
+ srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
+ srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
+ srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
+ srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
+ srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
+
for (i = 0; i < output_height; i++) {
- // load the first 8 bytes
- srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
- // load the next 8 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
- srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
- srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+ // load the last 8 bytes
+ srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
// merge the result together
- srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
- // load the next 8 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
- srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
- srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
- srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+ srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
// merge the result together
- srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
- srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+ srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
@@ -377,6 +378,15 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
src_ptr+=src_pitch;
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
// save only 8 bytes convolve result
_mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
@@ -390,9 +400,11 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
unsigned int out_pitch,
unsigned int output_height,
int16_t *filter) {
- __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+ __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -411,19 +423,24 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
  // duplicate only the fourth 16 bits in the filter
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+ // load the first 7 rows of 16 bytes
+ srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
+ srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
+
for (i = 0; i < output_height; i++) {
- // load the first 16 bytes
- srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
- // load the next 16 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
- srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+ // load the last 16 bytes
+ srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
// merge the result together
- srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
- srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+ srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
@@ -435,25 +452,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
- // load the next 16 bytes in stride of two/three src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
// merge the result together
- srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
- srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
// multiply 2 adjacent elements with the filter and add the result
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
- // load the next 16 bytes in stride of four/five src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
// merge the result together
- srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
- srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
@@ -461,13 +470,13 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+ _mm_min_epi16(srcRegFilt3, srcRegFilt7));
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
_mm_min_epi16(srcRegFilt6, srcRegFilt8));
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+ _mm_max_epi16(srcRegFilt3, srcRegFilt7));
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
_mm_max_epi16(srcRegFilt6, srcRegFilt8));
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
@@ -484,6 +493,15 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
src_ptr+=src_pitch;
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
// save 16 bytes convolve result
_mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
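
Both vertical-filter rewrites above trade eight row loads per output row for one: the first seven source rows are loaded before the loop, each iteration loads only the eighth, and after the store the registers rotate down a row. A scalar model of the pattern (the intrinsics also bias with 64 and shift by 7, libvpx's FILTER_BITS rounding, which the model reproduces):

#include <stdint.h>

static void vfilter8_model(const uint8_t *src, int src_pitch,
                           uint8_t *dst, int dst_pitch,
                           int width, int height, const int16_t tap[8]) {
  const uint8_t *row[8];
  int r, c, k;
  for (k = 0; k < 7; k++) row[k] = src + k * src_pitch;  /* preload 0..6 */
  for (r = 0; r < height; r++) {
    row[7] = src + (r + 7) * src_pitch;  /* the one new row per step */
    for (c = 0; c < width; c++) {
      int sum = 64;  /* addFilterReg64 */
      for (k = 0; k < 8; k++) sum += row[k][c] * tap[k];
      sum >>= 7;  /* FILTER_BITS */
      dst[r * dst_pitch + c] =
          (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    for (k = 0; k < 7; k++) row[k] = row[k + 1];  /* rotate the window */
  }
}
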
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index fd781d4bc6a..4a5bf1b6003 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -18,7 +18,7 @@
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
+ movq xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
@@ -661,7 +661,7 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
+ movq xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
@@ -765,40 +765,50 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movq xmm0, [rsi - 3] ;load src data
movq xmm4, [rsi + 5]
- movq xmm7, [rsi + 13]
+ movq xmm6, [rsi + 13]
punpcklqdq xmm0, xmm4
- punpcklqdq xmm4, xmm7
+ punpcklqdq xmm4, xmm6
+
+ movdqa xmm7, xmm0
+ punpcklbw xmm7, xmm7
+ punpckhbw xmm0, xmm0
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
- movdqa xmm5, xmm4
- movdqa xmm6, xmm4
- movdqa xmm7, xmm4
-
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pshufb xmm3, [GLOBAL(shuf_t6t7)]
- pshufb xmm4, [GLOBAL(shuf_t0t1)]
- pshufb xmm5, [GLOBAL(shuf_t2t3)]
- pshufb xmm6, [GLOBAL(shuf_t4t5)]
- pshufb xmm7, [GLOBAL(shuf_t6t7)]
+ palignr xmm0, xmm7, 1
+ palignr xmm1, xmm7, 5
pmaddubsw xmm0, k0k1
+ palignr xmm2, xmm7, 9
pmaddubsw xmm1, k2k3
+ palignr xmm3, xmm7, 13
+
pmaddubsw xmm2, k4k5
pmaddubsw xmm3, k6k7
- pmaddubsw xmm4, k0k1
- pmaddubsw xmm5, k2k3
- pmaddubsw xmm6, k4k5
- pmaddubsw xmm7, k6k7
-
paddsw xmm0, xmm3
+
+ movdqa xmm3, xmm4
+ punpcklbw xmm3, xmm3
+ punpckhbw xmm4, xmm4
+
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+
+ palignr xmm4, xmm3, 1
+ palignr xmm5, xmm3, 5
+ palignr xmm6, xmm3, 9
+ palignr xmm7, xmm3, 13
+
movdqa xmm3, xmm1
+ pmaddubsw xmm4, k0k1
pmaxsw xmm1, xmm2
+ pmaddubsw xmm5, k2k3
pminsw xmm2, xmm3
+ pmaddubsw xmm6, k4k5
paddsw xmm0, xmm2
+ pmaddubsw xmm7, k6k7
paddsw xmm0, xmm1
paddsw xmm4, xmm7
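
Two separate fixes in this file: movd becomes movq, the mnemonic assemblers expect when the source is a 64-bit general register such as rcx, and the shuf_t0t1..shuf_t6t7 table shuffles are replaced by doubling the bytes with punpcklbw/punpckhbw and slicing adjacent pairs out with palignr, removing four 16-byte constant loads from the inner loop. A scalar model of palignr's concatenate-and-shift semantics:

#include <stdint.h>
#include <string.h>

/* palignr dst, src, n: take 16 bytes of the 32-byte value dst:src,
 * starting n bytes up from the least-significant end (src side). */
static void palignr_model(const uint8_t dst_in[16], const uint8_t src[16],
                          int n, uint8_t out[16]) {
  uint8_t cat[32];
  memcpy(cat, src, 16);         /* low 16 bytes  */
  memcpy(cat + 16, dst_in, 16); /* high 16 bytes */
  memcpy(out, cat + n, 16);     /* valid for 0 <= n <= 16 */
}
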
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
index dc712f04500..eb9b7971074 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -23,6 +23,7 @@
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconintra.h"
@@ -36,7 +37,6 @@
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
#include "vp9/decoder/vp9_reader.h"
@@ -127,7 +127,7 @@ static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
}
static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
- FRAME_CONTEXT *const fc = &cm->fc;
+ FRAME_CONTEXT *const fc = cm->fc;
int i;
if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -181,14 +181,6 @@ static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) {
}
}
-static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
- int i;
- xd->plane[0].dequant = cm->y_dequant[q_index];
-
- for (i = 1; i < MAX_MB_PLANE; i++)
- xd->plane[i].dequant = cm->uv_dequant[q_index];
-}
-
static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
TX_SIZE tx_size, uint8_t *dst, int stride,
int eob) {
@@ -284,14 +276,14 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
#endif // CONFIG_VP9_HIGHBITDEPTH
if (eob == 1) {
- vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
+ memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
} else {
if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
- vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+ memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
else if (tx_size == TX_32X32 && eob <= 34)
- vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+ memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
else
- vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+ memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
}
}
}
@@ -299,7 +291,9 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
struct intra_args {
VP9_COMMON *cm;
MACROBLOCKD *xd;
+ FRAME_COUNTS *counts;
vp9_reader *r;
+ int seg_id;
};
static void predict_and_reconstruct_intra_block(int plane, int block,
@@ -309,7 +303,7 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
VP9_COMMON *const cm = args->cm;
MACROBLOCKD *const xd = args->xd;
struct macroblockd_plane *const pd = &xd->plane[plane];
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ MODE_INFO *const mi = xd->mi[0];
const PREDICTION_MODE mode = (plane == 0) ? get_y_mode(mi, block)
: mi->mbmi.uv_mode;
int x, y;
@@ -323,9 +317,9 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
x, y, plane);
if (!mi->mbmi.skip) {
- const int eob = vp9_decode_block_tokens(cm, xd, plane, block,
+ const int eob = vp9_decode_block_tokens(cm, xd, args->counts, plane, block,
plane_bsize, x, y, tx_size,
- args->r);
+ args->r, args->seg_id);
inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
eob);
}
@@ -335,7 +329,9 @@ struct inter_args {
VP9_COMMON *cm;
MACROBLOCKD *xd;
vp9_reader *r;
+ FRAME_COUNTS *counts;
int *eobtotal;
+ int seg_id;
};
static void reconstruct_inter_block(int plane, int block,
@@ -347,8 +343,8 @@ static void reconstruct_inter_block(int plane, int block,
struct macroblockd_plane *const pd = &xd->plane[plane];
int x, y, eob;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
- eob = vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y,
- tx_size, args->r);
+ eob = vp9_decode_block_tokens(cm, xd, args->counts, plane, block, plane_bsize,
+ x, y, tx_size, args->r, args->seg_id);
inverse_transform_block(xd, plane, block, tx_size,
&pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
pd->dst.stride, eob);
@@ -365,13 +361,12 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const int offset = mi_row * cm->mi_stride + mi_col;
int x, y;
- xd->mi = cm->mi + offset;
- xd->mi[0].src_mi = &xd->mi[0]; // Point to self.
- xd->mi[0].mbmi.sb_type = bsize;
-
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = &cm->mi[offset];
+ xd->mi[0]->mbmi.sb_type = bsize;
for (y = 0; y < y_mis; ++y)
for (x = !y; x < x_mis; ++x) {
- xd->mi[y * cm->mi_stride + x].src_mi = &xd->mi[0];
+ xd->mi[y * cm->mi_stride + x] = xd->mi[0];
}
set_skip_context(xd, mi_row, mi_col);
@@ -381,40 +376,38 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
- return &xd->mi[0].mbmi;
+ return &xd->mi[0]->mbmi;
}
-static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE bsize) {
+ VP9_COMMON *const cm = &pbi->common;
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
- vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
+ vp9_read_mode_info(pbi, xd, counts, tile, mi_row, mi_col, r);
if (less8x8)
bsize = BLOCK_8X8;
if (mbmi->skip) {
reset_skip_context(xd, bsize);
- } else {
- if (cm->seg.enabled)
- setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
- cm->base_qindex));
}
if (!is_inter_block(mbmi)) {
- struct intra_args arg = { cm, xd, r };
+ struct intra_args arg = {cm, xd, counts, r, mbmi->segment_id};
vp9_foreach_transformed_block(xd, bsize,
predict_and_reconstruct_intra_block, &arg);
} else {
// Prediction
- vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ vp9_dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
// Reconstruction
if (!mbmi->skip) {
int eobtotal = 0;
- struct inter_args arg = { cm, xd, r, &eobtotal };
+ struct inter_args arg = {cm, xd, r, counts, &eobtotal, mbmi->segment_id};
vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
if (!less8x8 && eobtotal == 0)
mbmi->skip = 1; // skip loopfilter
@@ -424,7 +417,8 @@ static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
xd->corrupted |= vp9_reader_has_error(r);
}
-static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
+static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts, int hbs,
int mi_row, int mi_col, BLOCK_SIZE bsize,
vp9_reader *r) {
const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -443,15 +437,17 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
p = PARTITION_SPLIT;
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.partition[ctx][p];
+ ++counts->partition[ctx][p];
return p;
}
-static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader* r, BLOCK_SIZE bsize) {
+ VP9_COMMON *const cm = &pbi->common;
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize, uv_subsize;
@@ -459,34 +455,37 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
+ partition = read_partition(cm, xd, counts, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y];
if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID)
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Invalid block size.");
+ vpx_internal_error(xd->error_info,
+ VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
if (subsize < BLOCK_8X8) {
- decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize);
} else {
switch (partition) {
case PARTITION_NONE:
- decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize);
break;
case PARTITION_HORZ:
- decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize);
if (mi_row + hbs < cm->mi_rows)
- decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row + hbs, mi_col, r, subsize);
break;
case PARTITION_VERT:
- decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize);
if (mi_col + hbs < cm->mi_cols)
- decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row, mi_col + hbs, r, subsize);
break;
case PARTITION_SPLIT:
- decode_partition(cm, xd, tile, mi_row, mi_col, r, subsize);
- decode_partition(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
- decode_partition(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
- decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
+ decode_partition(pbi, xd, counts, tile, mi_row, mi_col, r, subsize);
+ decode_partition(pbi, xd, counts, tile, mi_row, mi_col + hbs, r,
+ subsize);
+ decode_partition(pbi, xd, counts, tile, mi_row + hbs, mi_col, r,
+ subsize);
+ decode_partition(pbi, xd, counts, tile, mi_row + hbs, mi_col + hbs, r,
+ subsize);
break;
default:
assert(0 && "Invalid partition type");
@@ -617,34 +616,54 @@ static void setup_loopfilter(struct loopfilter *lf,
}
}
-static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
- const int old = *delta_q;
- *delta_q = vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
- return old != *delta_q;
+static INLINE int read_delta_q(struct vp9_read_bit_buffer *rb) {
+ return vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
}
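The simplified read_delta_q() above matches the VP9 delta-q syntax: a one-bit presence flag followed by a 4-bit signed literal. A hedged sketch of the two bit-buffer helpers it relies on, assuming the usual MSB-first literal with a trailing sign bit (names prefixed sketch_ to mark them as illustrations):

static int sketch_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
  int value = 0, bit;
  for (bit = bits - 1; bit >= 0; bit--)
    value |= vp9_rb_read_bit(rb) << bit;  /* MSB first */
  return value;
}

static int sketch_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
                                         int bits) {
  const int value = sketch_rb_read_literal(rb, bits);
  return vp9_rb_read_bit(rb) ? -value : value;  /* trailing sign bit */
}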
static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
struct vp9_read_bit_buffer *rb) {
- int update = 0;
-
cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS);
- update |= read_delta_q(rb, &cm->y_dc_delta_q);
- update |= read_delta_q(rb, &cm->uv_dc_delta_q);
- update |= read_delta_q(rb, &cm->uv_ac_delta_q);
- if (update || cm->bit_depth != cm->dequant_bit_depth) {
- vp9_init_dequantizer(cm);
- cm->dequant_bit_depth = cm->bit_depth;
- }
-
+ cm->y_dc_delta_q = read_delta_q(rb);
+ cm->uv_dc_delta_q = read_delta_q(rb);
+ cm->uv_ac_delta_q = read_delta_q(rb);
+ cm->dequant_bit_depth = cm->bit_depth;
xd->lossless = cm->base_qindex == 0 &&
cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
+
#if CONFIG_VP9_HIGHBITDEPTH
xd->bd = (int)cm->bit_depth;
#endif
}
+static void setup_segmentation_dequant(VP9_COMMON *const cm) {
+ // Build y/uv dequant values based on segmentation.
+ if (cm->seg.enabled) {
+ int i;
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = vp9_get_qindex(&cm->seg, i, cm->base_qindex);
+ cm->y_dequant[i][0] = vp9_dc_quant(qindex, cm->y_dc_delta_q,
+ cm->bit_depth);
+ cm->y_dequant[i][1] = vp9_ac_quant(qindex, 0, cm->bit_depth);
+ cm->uv_dequant[i][0] = vp9_dc_quant(qindex, cm->uv_dc_delta_q,
+ cm->bit_depth);
+ cm->uv_dequant[i][1] = vp9_ac_quant(qindex, cm->uv_ac_delta_q,
+ cm->bit_depth);
+ }
+ } else {
+ const int qindex = cm->base_qindex;
+ // When segmentation is disabled, only the first value is used; the
+ // remaining entries are don't-cares.
+ cm->y_dequant[0][0] = vp9_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth);
+ cm->y_dequant[0][1] = vp9_ac_quant(qindex, 0, cm->bit_depth);
+ cm->uv_dequant[0][0] = vp9_dc_quant(qindex, cm->uv_dc_delta_q,
+ cm->bit_depth);
+ cm->uv_dequant[0][1] = vp9_ac_quant(qindex, cm->uv_ac_delta_q,
+ cm->bit_depth);
+ }
+}
+
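Each segment above resolves to a {DC, AC} dequantizer pair per plane type via vp9_get_qindex(). A hedged sketch of how such a pair is consumed downstream (generic illustration, not the libvpx API):

#include <stdint.h>

static void sketch_dequant_block(const int16_t dequant[2],
                                 const int16_t *qcoeff, int32_t *dqcoeff,
                                 int n) {
  int i;
  for (i = 0; i < n; ++i)
    dqcoeff[i] = qcoeff[i] * dequant[i != 0];  /* [0] scales DC, [1] the ACs */
}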
static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) {
const INTERP_FILTER literal_to_filter[] = { EIGHTTAP_SMOOTH,
EIGHTTAP,
@@ -667,6 +686,14 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
vp9_read_frame_size(rb, &cm->display_width, &cm->display_height);
}
+static void resize_mv_buffer(VP9_COMMON *cm) {
+ vpx_free(cm->cur_frame->mvs);
+ cm->cur_frame->mi_rows = cm->mi_rows;
+ cm->cur_frame->mi_cols = cm->mi_cols;
+ cm->cur_frame->mvs = (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*cm->cur_frame->mvs));
+}
+
static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
#if CONFIG_SIZE_LIMIT
if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
@@ -692,14 +719,20 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
cm->width = width;
cm->height = height;
}
+ if (cm->cur_frame->mvs == NULL || cm->mi_rows > cm->cur_frame->mi_rows ||
+ cm->mi_cols > cm->cur_frame->mi_cols) {
+ resize_mv_buffer(cm);
+ }
}
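resize_mv_buffer() is invoked under a grow-only policy: the check above reallocates only when the new mi geometry exceeds what cur_frame->mvs was last sized for, so shrinking frames keep reusing the allocation. The same pattern in isolation (generic sketch, not libvpx API):

#include <stdlib.h>

static void sketch_ensure_capacity(void **buf, size_t elem_size,
                                   int *cap_rows, int *cap_cols,
                                   int rows, int cols) {
  if (*buf == NULL || rows > *cap_rows || cols > *cap_cols) {
    free(*buf);  /* grow-only: never shrink, old contents are not kept */
    *buf = calloc((size_t)rows * cols, elem_size);
    *cap_rows = rows;
    *cap_cols = cols;
  }
}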
static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
int width, height;
+ BufferPool *const pool = cm->buffer_pool;
vp9_read_frame_size(rb, &width, &height);
resize_context_buffers(cm, width, height);
setup_display_size(cm, rb);
+ lock_buffer_pool(pool);
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
@@ -707,14 +740,19 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
cm->use_highbitdepth,
#endif
VP9_DEC_BORDER_IN_PIXELS,
- &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
- cm->cb_priv)) {
+ cm->byte_alignment,
+ &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+ pool->cb_priv)) {
+ unlock_buffer_pool(pool);
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
- cm->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
- cm->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
- cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+ unlock_buffer_pool(pool);
+
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+ pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
}
static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
@@ -730,15 +768,12 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
int width, height;
int found = 0, i;
int has_valid_ref_frame = 0;
+ BufferPool *const pool = cm->buffer_pool;
for (i = 0; i < REFS_PER_FRAME; ++i) {
if (vp9_rb_read_bit(rb)) {
YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
width = buf->y_crop_width;
height = buf->y_crop_height;
- if (buf->corrupted) {
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Frame reference is corrupt");
- }
found = 1;
break;
}
@@ -772,12 +807,13 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
cm->subsampling_x,
cm->subsampling_y))
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Referenced frame has incompatible color space");
+ "Referenced frame has incompatible color format");
}
resize_context_buffers(cm, width, height);
setup_display_size(cm, rb);
+ lock_buffer_pool(pool);
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
@@ -785,14 +821,19 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
cm->use_highbitdepth,
#endif
VP9_DEC_BORDER_IN_PIXELS,
- &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
- cm->cb_priv)) {
+ cm->byte_alignment,
+ &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+ pool->cb_priv)) {
+ unlock_buffer_pool(pool);
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
- cm->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
- cm->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
- cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+ unlock_buffer_pool(pool);
+
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+ pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
}
static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -902,12 +943,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
// Be sure to sync as we might be resuming after a failed frame decode.
winterface->sync(&pbi->lf_worker);
- lf_data->frame_buffer = get_frame_new_buffer(cm);
- lf_data->cm = cm;
- vp9_copy(lf_data->planes, pbi->mb.plane);
- lf_data->stop = 0;
- lf_data->y_only = 0;
- vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
+ vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
+ pbi->mb.plane);
}
assert(tile_rows <= 4);
@@ -915,11 +952,11 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context, 0,
- sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
+ memset(cm->above_context, 0,
+ sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
- vpx_memset(cm->above_seg_context, 0,
- sizeof(*cm->above_seg_context) * aligned_cols);
+ memset(cm->above_seg_context, 0,
+ sizeof(*cm->above_seg_context) * aligned_cols);
get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
@@ -947,7 +984,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
init_macroblockd(cm, &tile_data->xd);
- vp9_zero(tile_data->xd.dqcoeff);
}
}
@@ -965,13 +1001,16 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += MI_BLOCK_SIZE) {
- decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
- &tile_data->bit_reader, BLOCK_64X64);
+ decode_partition(pbi, &tile_data->xd, &cm->counts, &tile, mi_row,
+ mi_col, &tile_data->bit_reader, BLOCK_64X64);
}
pbi->mb.corrupted |= tile_data->xd.corrupted;
+ if (pbi->mb.corrupted)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
}
// Loopfilter one row.
- if (cm->lf.filter_level && !pbi->mb.corrupted) {
+ if (cm->lf.filter_level) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
@@ -990,11 +1029,17 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
winterface->execute(&pbi->lf_worker);
}
}
+ // After loopfiltering, the last 7 rows of pixels in each superblock row
+ // may still be changed by the longest loopfilter of the next superblock
+ // row.
+ if (pbi->frame_parallel_decode)
+ vp9_frameworker_broadcast(pbi->cur_buf,
+ mi_row << MI_BLOCK_SIZE_LOG2);
}
}
// Loopfilter remaining rows in the frame.
- if (cm->lf.filter_level && !pbi->mb.corrupted) {
+ if (cm->lf.filter_level) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
winterface->sync(&pbi->lf_worker);
lf_data->start = lf_data->stop;
@@ -1005,6 +1050,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
// Get last tile data.
tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
+ if (pbi->frame_parallel_decode)
+ vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
return vp9_reader_find_end(&tile_data->bit_reader);
}
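The vp9_frameworker_broadcast() calls above publish decoded-row progress so a later frame's worker can start referencing this frame before it finishes; INT_MAX marks completion. A minimal sketch of that producer/consumer hand-off in plain pthreads (the real vp9_frameworker_* helpers may differ in detail):

#include <pthread.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int rows_done;  /* highest fully-decoded pixel row, or INT_MAX at end */
} SketchRowProgress;

static void sketch_broadcast(SketchRowProgress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  p->rows_done = row;
  pthread_cond_broadcast(&p->cond);  /* wake every waiting consumer */
  pthread_mutex_unlock(&p->mutex);
}

static void sketch_wait(SketchRowProgress *p, int row_needed) {
  pthread_mutex_lock(&p->mutex);
  while (p->rows_done < row_needed)
    pthread_cond_wait(&p->cond, &p->mutex);
  pthread_mutex_unlock(&p->mutex);
}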
@@ -1012,14 +1059,24 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
const TileInfo *const tile) {
int mi_row, mi_col;
+ if (setjmp(tile_data->error_info.jmp)) {
+ tile_data->error_info.setjmp = 0;
+ tile_data->xd.corrupted = 1;
+ return 0;
+ }
+
+ tile_data->error_info.setjmp = 1;
+ tile_data->xd.error_info = &tile_data->error_info;
+
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
vp9_zero(tile_data->xd.left_context);
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
- decode_partition(tile_data->cm, &tile_data->xd, tile,
- mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
+ decode_partition(tile_data->pbi, &tile_data->xd, &tile_data->counts,
+ tile, mi_row, mi_col, &tile_data->bit_reader,
+ BLOCK_64X64);
}
}
return !tile_data->xd.corrupted;
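The setjmp() at the top of the hook is what lets vpx_internal_error() abort a single tile without taking down the process: on a corrupt tile the error path longjmp()s back to the hook, which marks its own xd.corrupted and returns while the other workers keep going. A stripped-down sketch of the pattern (illustrative names):

#include <setjmp.h>

struct sketch_error_info {
  jmp_buf jmp;
  int setjmp_armed;
};

static void sketch_internal_error(struct sketch_error_info *info) {
  if (info->setjmp_armed)
    longjmp(info->jmp, 1);  /* unwinds straight back to the worker hook */
}

static int sketch_worker_hook(struct sketch_error_info *info) {
  if (setjmp(info->jmp)) {
    info->setjmp_armed = 0;
    return 0;              /* this tile failed; others are unaffected */
  }
  info->setjmp_armed = 1;
  /* ... decode the tile; any error longjmp()s to the setjmp above ... */
  return 1;
}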
@@ -1029,13 +1086,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
static int compare_tile_buffers(const void *a, const void *b) {
const TileBuffer *const buf1 = (const TileBuffer*)a;
const TileBuffer *const buf2 = (const TileBuffer*)b;
- if (buf1->size < buf2->size) {
- return 1;
- } else if (buf1->size == buf2->size) {
- return 0;
- } else {
- return -1;
- }
+ return (int)(buf2->size - buf1->size);
}
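compare_tile_buffers() now orders descending by size so the largest tiles are dispatched first and the longest-running work starts earliest. Note the size_t subtraction orders correctly only while two tile sizes differ by less than INT_MAX, which holds for real bitstreams. Usage is the standard qsort() pairing (sketch):

#include <stdlib.h>

static void sketch_sort_tiles_largest_first(TileBuffer *bufs, int n) {
  /* Largest tiles first, so the slowest work is started earliest. */
  qsort(bufs, (size_t)n, sizeof(*bufs), compare_tile_buffers);
}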
static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
@@ -1065,14 +1116,19 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
// use num_threads - 1 workers.
CHECK_MEM_ERROR(cm, pbi->tile_workers,
vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
+ // Ensure tile data offsets will be properly aligned. This may fail on
+ // platforms without DECLARE_ALIGNED().
+ assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
+ CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
+ vpx_memalign(32, num_threads *
+ sizeof(*pbi->tile_worker_data)));
+ CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
+ vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
for (i = 0; i < num_threads; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
++pbi->num_tile_workers;
winterface->init(worker);
- CHECK_MEM_ERROR(cm, worker->data1,
- vpx_memalign(32, sizeof(TileWorkerData)));
- CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
if (i < num_threads - 1 && !winterface->reset(worker)) {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Tile decoder thread creation failed");
@@ -1082,16 +1138,19 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
// Reset tile decoding hook
for (n = 0; n < num_workers; ++n) {
- winterface->sync(&pbi->tile_workers[n]);
- pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
+ VP9Worker *const worker = &pbi->tile_workers[n];
+ winterface->sync(worker);
+ worker->hook = (VP9WorkerHook)tile_worker_hook;
+ worker->data1 = &pbi->tile_worker_data[n];
+ worker->data2 = &pbi->tile_worker_info[n];
}
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context, 0,
- sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
- vpx_memset(cm->above_seg_context, 0,
- sizeof(*cm->above_seg_context) * aligned_mi_cols);
+ memset(cm->above_context, 0,
+ sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
+ memset(cm->above_seg_context, 0,
+ sizeof(*cm->above_seg_context) * aligned_mi_cols);
// Load tile data into tile_buffers
get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
@@ -1116,6 +1175,17 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
}
}
+ // Initialize thread frame counts.
+ if (!cm->frame_parallel_decoding_mode) {
+ int i;
+
+ for (i = 0; i < num_workers; ++i) {
+ TileWorkerData *const tile_data =
+ (TileWorkerData*)pbi->tile_workers[i].data1;
+ vp9_zero(tile_data->counts);
+ }
+ }
+
n = 0;
while (n < tile_cols) {
int i;
@@ -1125,15 +1195,14 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
TileInfo *const tile = (TileInfo*)worker->data2;
TileBuffer *const buf = &tile_buffers[0][n];
- tile_data->cm = cm;
+ tile_data->pbi = pbi;
tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0;
- vp9_tile_init(tile, tile_data->cm, 0, buf->col);
+ vp9_tile_init(tile, cm, 0, buf->col);
setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
init_macroblockd(cm, &tile_data->xd);
- vp9_zero(tile_data->xd.dqcoeff);
worker->had_error = 0;
if (i == num_workers - 1 || n == tile_cols - 1) {
@@ -1151,6 +1220,10 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
for (; i > 0; --i) {
VP9Worker *const worker = &pbi->tile_workers[i - 1];
+ // TODO(jzern): The tile may have specific error data associated with
+ // its vpx_internal_error_info which could be propagated to the main info
+ // in cm. Additionally, once the threads have been synced and an error is
+ // detected, there's no point in continuing to decode tiles.
pbi->mb.corrupted |= !winterface->sync(worker);
}
if (final_worker > -1) {
@@ -1159,6 +1232,15 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
bit_reader_end = vp9_reader_find_end(&tile_data->bit_reader);
final_worker = -1;
}
+
+ // Accumulate thread frame counts.
+ if (n >= tile_cols && !cm->frame_parallel_decoding_mode) {
+ for (i = 0; i < num_workers; ++i) {
+ TileWorkerData *const tile_data =
+ (TileWorkerData*)pbi->tile_workers[i].data1;
+ vp9_accumulate_frame_counts(cm, &tile_data->counts, 1);
+ }
+ }
}
return bit_reader_end;
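Zeroing each worker's counts before decoding and folding them back with vp9_accumulate_frame_counts() only after every worker has synced keeps the symbol statistics lock-free on the hot path. The accumulation itself is element-wise addition; a generic sketch of what it does across the FRAME_COUNTS fields:

static void sketch_accumulate(unsigned int *total,
                              const unsigned int *per_thread, int n) {
  int i;
  for (i = 0; i < n; ++i)
    total[i] += per_thread[i];  /* applied field-by-field over the
                                   FRAME_COUNTS structure */
}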
@@ -1196,8 +1278,8 @@ static void read_bitdepth_colorspace_sampling(
cm->use_highbitdepth = 0;
#endif
}
- cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
- if (cm->color_space != SRGB) {
+ cm->color_space = vp9_rb_read_literal(rb, 3);
+ if (cm->color_space != VPX_CS_SRGB) {
vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range
if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
cm->subsampling_x = vp9_rb_read_bit(rb);
@@ -1229,8 +1311,10 @@ static void read_bitdepth_colorspace_sampling(
static size_t read_uncompressed_header(VP9Decoder *pbi,
struct vp9_read_bit_buffer *rb) {
VP9_COMMON *const cm = &pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ BufferPool *const pool = pbi->common.buffer_pool;
+ int i, mask, ref_index = 0;
size_t sz;
- int i;
cm->last_frame_type = cm->frame_type;
@@ -1248,16 +1332,24 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
if (cm->show_existing_frame) {
// Show an existing frame directly.
const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
-
- if (frame_to_show < 0 || cm->frame_bufs[frame_to_show].ref_count < 1)
+ lock_buffer_pool(pool);
+ if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+ unlock_buffer_pool(pool);
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Buffer %d does not contain a decoded frame",
frame_to_show);
+ }
- ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show);
+ ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+ unlock_buffer_pool(pool);
pbi->refresh_frame_flags = 0;
cm->lf.filter_level = 0;
cm->show_frame = 1;
+
+ if (pbi->frame_parallel_decode) {
+ for (i = 0; i < REF_FRAMES; ++i)
+ cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
+ }
return 0;
}
@@ -1274,12 +1366,15 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
for (i = 0; i < REFS_PER_FRAME; ++i) {
- cm->frame_refs[i].idx = -1;
+ cm->frame_refs[i].idx = INVALID_IDX;
cm->frame_refs[i].buf = NULL;
}
setup_frame_size(cm, rb);
- pbi->need_resync = 0;
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
} else {
cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
@@ -1295,9 +1390,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
} else {
// NOTE: The intra-only frame header does not include the specification
// of either the color format or color sub-sampling in profile 0. VP9
- // specifies that the default color space should be YUV 4:2:0 in this
+ // specifies that the default color format should be YUV 4:2:0 in this
// case (normative).
- cm->color_space = BT_601;
+ cm->color_space = VPX_CS_BT_601;
cm->subsampling_y = cm->subsampling_x = 1;
cm->bit_depth = VPX_BITS_8;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -1307,15 +1402,18 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
setup_frame_size(cm, rb);
- pbi->need_resync = 0;
- } else {
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
+ } else if (pbi->need_resync != 1) { /* Skip if a resync is needed */
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
for (i = 0; i < REFS_PER_FRAME; ++i) {
const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2);
const int idx = cm->ref_frame_map[ref];
RefBuffer *const ref_frame = &cm->frame_refs[i];
ref_frame->idx = idx;
- ref_frame->buf = &cm->frame_bufs[idx].buf;
+ ref_frame->buf = &frame_bufs[idx].buf;
cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
}
@@ -1338,14 +1436,13 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
ref_buf->buf->y_crop_height,
cm->width, cm->height);
#endif
- if (vp9_is_scaled(&ref_buf->sf))
- vp9_extend_frame_borders(ref_buf->buf);
}
}
}
#if CONFIG_VP9_HIGHBITDEPTH
get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
#endif
+ get_frame_new_buffer(cm)->color_space = cm->color_space;
if (pbi->need_resync) {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
@@ -1365,12 +1462,37 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
// below, forcing the use of context 0 for those frame types.
cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
+ // Generate next_ref_frame_map.
+ lock_buffer_pool(pool);
+ for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ if (mask & 1) {
+ cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+ ++frame_bufs[cm->new_fb_idx].ref_count;
+ } else {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+ }
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ ++ref_index;
+ }
+
+ for (; ref_index < REF_FRAMES; ++ref_index) {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ }
+ unlock_buffer_pool(pool);
+ pbi->hold_ref_buf = 1;
+
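The loop above precomputes next_ref_frame_map and takes one reference per slot so that, in frame-parallel mode, the next frame's worker can adopt the new map while this thread still holds the old one pinned. The same bookkeeping restated without the pool locking (sketch only):

static void sketch_build_next_map(int *next_map, const int *cur_map,
                                  int *ref_counts, unsigned refresh_flags,
                                  int new_idx, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    if ((refresh_flags >> i) & 1) {
      next_map[i] = new_idx;
      ++ref_counts[new_idx];     /* next map references the new buffer */
    } else {
      next_map[i] = cur_map[i];  /* slot carried over unchanged */
    }
    if (cur_map[i] >= 0)
      ++ref_counts[cur_map[i]];  /* this thread keeps the old slot alive */
  }
}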
if (frame_is_intra_only(cm) || cm->error_resilient_mode)
vp9_setup_past_independence(cm);
setup_loopfilter(&cm->lf, rb);
setup_quantization(cm, &pbi->mb, rb);
setup_segmentation(&cm->seg, rb);
+ setup_segmentation_dequant(cm);
setup_tile_info(cm, rb);
sz = vp9_rb_read_literal(rb, 16);
@@ -1386,7 +1508,7 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
size_t partition_size) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- FRAME_CONTEXT *const fc = &cm->fc;
+ FRAME_CONTEXT *const fc = cm->fc;
vp9_reader r;
int k;
@@ -1434,18 +1556,6 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
return vp9_reader_has_error(&r);
}
-void vp9_init_dequantizer(VP9_COMMON *cm) {
- int q;
-
- for (q = 0; q < QINDEX_RANGE; q++) {
- cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth);
- cm->y_dequant[q][1] = vp9_ac_quant(q, 0, cm->bit_depth);
-
- cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth);
- cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth);
- }
-}
-
#ifdef NDEBUG
#define debug_check_frame_counts(cm) (void)0
#else // !NDEBUG
@@ -1510,7 +1620,7 @@ void vp9_decode_frame(VP9Decoder *pbi,
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
-
+ int context_updated = 0;
uint8_t clear_data[MAX_VP9_HEADER_SIZE];
const size_t first_partition_size = read_uncompressed_header(pbi,
init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
@@ -1530,40 +1640,68 @@ void vp9_decode_frame(VP9Decoder *pbi,
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
- init_macroblockd(cm, &pbi->mb);
-
- if (!cm->error_resilient_mode)
- set_prev_mi(cm);
- else
- cm->prev_mi = NULL;
+ cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+ cm->width == cm->last_width &&
+ cm->height == cm->last_height &&
+ !cm->intra_only &&
+ cm->last_show_frame;
- setup_plane_dequants(cm, xd, cm->base_qindex);
vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
- cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ if (!cm->fc->initialized)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Uninitialized entropy context.");
+
vp9_zero(cm->counts);
- vp9_zero(xd->dqcoeff);
xd->corrupted = 0;
new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
+ if (new_fb->corrupted)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data header is corrupted.");
+
+ if (cm->lf.filter_level) {
+ vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
+ }
- // TODO(jzern): remove frame_parallel_decoding_mode restriction for
- // single-frame tile decoding.
- if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
- cm->frame_parallel_decoding_mode) {
+ // If encoded in frame parallel mode, frame context is ready after decoding
+ // the frame header.
+ if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
+ VP9Worker *const worker = pbi->frame_worker_owner;
+ FrameWorkerData *const frame_worker_data = worker->data1;
+ if (cm->refresh_frame_context) {
+ context_updated = 1;
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+ }
+ vp9_frameworker_lock_stats(worker);
+ pbi->cur_buf->row = -1;
+ pbi->cur_buf->col = -1;
+ frame_worker_data->frame_context_ready = 1;
+ // Signal the main thread that context is ready.
+ vp9_frameworker_signal_stats(worker);
+ vp9_frameworker_unlock_stats(worker);
+ }
+
+ if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
+ // Multi-threaded tile decoder
*p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
if (!xd->corrupted) {
// If multiple threads are used to decode tiles, then we use those threads
// to do parallel loopfiltering.
- vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+ vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level,
+ 0, 0, pbi->tile_workers, pbi->num_tile_workers,
+ &pbi->lf_row_sync);
+ } else {
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data is corrupted.");
+
}
} else {
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
}
- new_fb->corrupted |= xd->corrupted;
-
- if (!new_fb->corrupted) {
+ if (!xd->corrupted) {
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(cm);
@@ -1579,6 +1717,331 @@ void vp9_decode_frame(VP9Decoder *pbi,
"Decode failed. Frame data is corrupted.");
}
- if (cm->refresh_frame_context)
- cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+ // Non frame parallel update frame context here.
+ if (cm->refresh_frame_context && !context_updated)
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+}
+
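When the stream was encoded with frame_parallel_decoding_mode, the entropy context is final once the headers are read, so the block above publishes it early: set the flag under the stats lock, then wake the waiting thread. In pthread terms, assuming the vp9_frameworker_*_stats helpers wrap a mutex/condvar pair (sketch):

#include <pthread.h>

static void sketch_publish_context_ready(pthread_mutex_t *mutex,
                                         pthread_cond_t *cond,
                                         int *frame_context_ready) {
  pthread_mutex_lock(mutex);
  *frame_context_ready = 1;
  pthread_cond_signal(cond);  /* wake the consumer waiting on the flag */
  pthread_mutex_unlock(mutex);
}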
+static void build_mc_border(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int x, int y, int b_w, int b_h, int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint8_t *ref_row = src - x - y * src_stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ int right = 0, copy;
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w)
+ left = b_w;
+
+ if (x + b_w > w)
+ right = x + b_w - w;
+
+ if (right > b_w)
+ right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left)
+ memset(dst, ref_row[0], left);
+
+ if (copy)
+ memcpy(dst + left, ref_row + x + left, copy);
+
+ if (right)
+ memset(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ if (y > 0 && y < h)
+ ref_row += src_stride;
+ } while (--b_h);
+}
+
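build_mc_border() above replicates edge pixels so out-of-frame taps read the nearest valid sample, i.e. dst(x, y) = src(clamp(x, 0, w-1), clamp(y, 0, h-1)). A tiny runnable illustration of that clamping rule (not the function itself, which writes row-wise with memset/memcpy):

#include <stdint.h>
#include <stdio.h>

static uint8_t clamp_px(const uint8_t *src, int stride, int w, int h,
                        int x, int y) {
  if (x < 0) x = 0; else if (x > w - 1) x = w - 1;
  if (y < 0) y = 0; else if (y > h - 1) y = h - 1;
  return src[y * stride + x];
}

int main(void) {
  const uint8_t src[4] = { 10, 20, 30, 40 };  /* 2x2 frame, stride 2 */
  int x, y;
  /* Fetch a 4x4 block at x = -2, y = -1: edges replicate outward. */
  for (y = -1; y < 3; ++y) {
    for (x = -2; x < 2; ++x)
      printf("%3d ", clamp_px(src, 2, 2, 2, x, y));
    printf("\n");
  }
  return 0;
}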
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_build_mc_border(const uint8_t *src8, int src_stride,
+ uint16_t *dst, int dst_stride,
+ int x, int y, int b_w, int b_h,
+ int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *ref_row = src - x - y * src_stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ int right = 0, copy;
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w)
+ left = b_w;
+
+ if (x + b_w > w)
+ right = x + b_w - w;
+
+ if (right > b_w)
+ right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left)
+ vpx_memset16(dst, ref_row[0], left);
+
+ if (copy)
+ memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
+
+ if (right)
+ vpx_memset16(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ if (y > 0 && y < h)
+ ref_row += src_stride;
+ } while (--b_h);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
+ int plane, int bw, int bh, int x,
+ int y, int w, int h, int mi_x, int mi_y,
+ const InterpKernel *kernel,
+ const struct scale_factors *sf,
+ struct buf_2d *pre_buf, struct buf_2d *dst_buf,
+ const MV* mv, RefCntBuffer *ref_frame_buf,
+ int is_scaled, int ref) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ MV32 scaled_mv;
+ int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
+ buf_stride, subpel_x, subpel_y;
+ uint8_t *ref_frame, *buf_ptr;
+
+ // Get reference frame pointer, width and height.
+ if (plane == 0) {
+ frame_width = ref_frame_buf->buf.y_crop_width;
+ frame_height = ref_frame_buf->buf.y_crop_height;
+ ref_frame = ref_frame_buf->buf.y_buffer;
+ } else {
+ frame_width = ref_frame_buf->buf.uv_crop_width;
+ frame_height = ref_frame_buf->buf.uv_crop_height;
+ ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+ : ref_frame_buf->buf.v_buffer;
+ }
+
+ if (is_scaled) {
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
+ pd->subsampling_x,
+ pd->subsampling_y);
+ // Co-ordinate of containing block to pixel precision.
+ int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+ int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+ // Co-ordinate of the block to 1/16th pixel precision.
+ x0_16 = (x_start + x) << SUBPEL_BITS;
+ y0_16 = (y_start + y) << SUBPEL_BITS;
+
+ // Co-ordinate of current block in reference frame
+ // to 1/16th pixel precision.
+ x0_16 = sf->scale_value_x(x0_16, sf);
+ y0_16 = sf->scale_value_y(y0_16, sf);
+
+ // Map the top left corner of the block into the reference frame.
+ x0 = sf->scale_value_x(x_start + x, sf);
+ y0 = sf->scale_value_y(y_start + y, sf);
+
+ // Scale the MV and incorporate the sub-pixel offset of the block
+ // in the reference frame.
+ scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ xs = sf->x_step_q4;
+ ys = sf->y_step_q4;
+ } else {
+ // Co-ordinate of containing block to pixel precision.
+ x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+ y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+ // Co-ordinate of the block to 1/16th pixel precision.
+ x0_16 = x0 << SUBPEL_BITS;
+ y0_16 = y0 << SUBPEL_BITS;
+
+ scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
+ scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
+ xs = ys = 16;
+ }
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+ // Calculate the top left corner of the best matching block in the
+ // reference frame.
+ x0 += scaled_mv.col >> SUBPEL_BITS;
+ y0 += scaled_mv.row >> SUBPEL_BITS;
+ x0_16 += scaled_mv.col;
+ y0_16 += scaled_mv.row;
+
+ // Get reference block pointer.
+ buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+ buf_stride = pre_buf->stride;
+
+ // Do border extension if there is motion or the
+ // width/height is not a multiple of 8 pixels.
+ if (is_scaled || scaled_mv.col || scaled_mv.row ||
+ (frame_width & 0x7) || (frame_height & 0x7)) {
+ int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+
+ // Get reference block bottom right horizontal coordinate.
+ int x1 = (x0_16 + (w - 1) * xs) >> SUBPEL_BITS;
+ int x_pad = 0, y_pad = 0;
+
+ if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+ x0 -= VP9_INTERP_EXTEND - 1;
+ x1 += VP9_INTERP_EXTEND;
+ x_pad = 1;
+ }
+
+ if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+ y0 -= VP9_INTERP_EXTEND - 1;
+ y1 += VP9_INTERP_EXTEND;
+ y_pad = 1;
+ }
+
+ // Wait until the reference block is ready. Pad 7 more pixels, as the
+ // last 7 rows of each superblock row can still be changed by the next one.
+ if (pbi->frame_parallel_decode)
+ vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+ MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+
+ // Extend the border only when the block reaches outside the frame.
+ if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
+ y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+ uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
+ // Extend the border.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_build_mc_border(buf_ptr1,
+ pre_buf->stride,
+ xd->mc_buf_high,
+ x1 - x0 + 1,
+ x0,
+ y0,
+ x1 - x0 + 1,
+ y1 - y0 + 1,
+ frame_width,
+ frame_height);
+ buf_stride = x1 - x0 + 1;
+ buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) +
+ y_pad * 3 * buf_stride + x_pad * 3;
+ } else {
+ build_mc_border(buf_ptr1,
+ pre_buf->stride,
+ xd->mc_buf,
+ x1 - x0 + 1,
+ x0,
+ y0,
+ x1 - x0 + 1,
+ y1 - y0 + 1,
+ frame_width,
+ frame_height);
+ buf_stride = x1 - x0 + 1;
+ buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
+ }
+#else
+ build_mc_border(buf_ptr1,
+ pre_buf->stride,
+ xd->mc_buf,
+ x1 - x0 + 1,
+ x0,
+ y0,
+ x1 - x0 + 1,
+ y1 - y0 + 1,
+ frame_width,
+ frame_height);
+ buf_stride = x1 - x0 + 1;
+ buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ } else {
+ // Wait until the reference block is ready. Pad 7 more pixels, as the
+ // last 7 rows of each superblock row can still be changed by the next one.
+ if (pbi->frame_parallel_decode) {
+ const int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+ vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+ MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+ }
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+ } else {
+ inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys);
+ }
+#else
+ inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
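In the unscaled path above, the shift (1 << (1 - subsampling)) converts the 1/8-pel motion vector into the interpolator's 1/16-pel units: a full-resolution plane doubles the MV, while a half-size 4:2:0 chroma plane uses it as-is. The integer offset and sub-pel phase then fall out of SUBPEL_BITS (4) and SUBPEL_MASK (15). A worked check:

#include <assert.h>

int main(void) {
  const int mv_col = 37;                       /* 1/8-pel luma units */
  const int luma = mv_col * (1 << (1 - 0));    /* 74 in 1/16-pel */
  const int chroma = mv_col * (1 << (1 - 1));  /* 37 in 1/16-pel */
  assert(luma >> 4 == 4 && (luma & 15) == 10);    /* +4 px, phase 10/16 */
  assert(chroma >> 4 == 2 && (chroma & 15) == 5); /* +2 px, phase 5/16 */
  return 0;
}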
+void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const MODE_INFO *mi = xd->mi[0];
+ const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+ const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
+ const int is_compound = has_second_ref(&mi->mbmi);
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+ &xd->plane[plane]);
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+ int ref;
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ const int idx = xd->block_refs[ref]->idx;
+ BufferPool *const pool = pbi->common.buffer_pool;
+ RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
+ const int is_scaled = vp9_is_scaled(sf);
+
+ if (sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y) {
+ for (x = 0; x < num_4x4_w; ++x) {
+ const MV mv = average_split_mvs(pd, mi, ref, i++);
+ dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ 4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
+ sf, pre_buf, dst_buf, &mv,
+ ref_frame_buf, is_scaled, ref);
+ }
+ }
+ } else {
+ const MV mv = mi->mbmi.mv[ref].as_mv;
+ dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ 0, 0, bw, bh, mi_x, mi_y, kernel,
+ sf, pre_buf, dst_buf, &mv, ref_frame_buf,
+ is_scaled, ref);
+ }
+ }
+ }
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h
index 10a9e34629b..8410c541e45 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h
@@ -31,6 +31,9 @@ void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
int *width, int *height);
BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb);
+void vp9_dec_build_inter_predictors_sb(struct VP9Decoder *const pbi,
+ MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c
index a01fe842ee2..ce6ff997778 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c
@@ -27,29 +27,31 @@ static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
return (PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p);
}
-static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
- int size_group) {
+static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, FRAME_COUNTS *counts,
+ vp9_reader *r, int size_group) {
const PREDICTION_MODE y_mode =
- read_intra_mode(r, cm->fc.y_mode_prob[size_group]);
+ read_intra_mode(r, cm->fc->y_mode_prob[size_group]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.y_mode[size_group][y_mode];
+ ++counts->y_mode[size_group][y_mode];
return y_mode;
}
-static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
+static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, FRAME_COUNTS *counts,
+ vp9_reader *r,
PREDICTION_MODE y_mode) {
const PREDICTION_MODE uv_mode = read_intra_mode(r,
- cm->fc.uv_mode_prob[y_mode]);
+ cm->fc->uv_mode_prob[y_mode]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.uv_mode[y_mode][uv_mode];
+ ++counts->uv_mode[y_mode][uv_mode];
return uv_mode;
}
-static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, int ctx) {
+static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, FRAME_COUNTS *counts,
+ vp9_reader *r, int ctx) {
const int mode = vp9_read_tree(r, vp9_inter_mode_tree,
- cm->fc.inter_mode_probs[ctx]);
+ cm->fc->inter_mode_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.inter_mode[ctx][mode];
+ ++counts->inter_mode[ctx][mode];
return NEARESTMV + mode;
}
@@ -59,9 +61,10 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
}
static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
TX_SIZE max_tx_size, vp9_reader *r) {
const int ctx = vp9_get_tx_size_context(xd);
- const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc.tx_probs);
+ const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
int tx_size = vp9_read(r, tx_probs[0]);
if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
tx_size += vp9_read(r, tx_probs[1]);
@@ -70,15 +73,18 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
}
if (!cm->frame_parallel_decoding_mode)
- ++get_tx_counts(max_tx_size, ctx, &cm->counts.tx)[tx_size];
+ ++get_tx_counts(max_tx_size, ctx, &counts->tx)[tx_size];
return (TX_SIZE)tx_size;
}
-static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_MODE tx_mode,
- BLOCK_SIZE bsize, int allow_select, vp9_reader *r) {
+static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
+ int allow_select, vp9_reader *r) {
+ TX_MODE tx_mode = cm->tx_mode;
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
- return read_selected_tx_size(cm, xd, max_tx_size, r);
+ return read_selected_tx_size(cm, xd, counts, max_tx_size, r);
else
return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
}
@@ -96,21 +102,40 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
for (y = 0; y < ymis; y++)
for (x = 0; x < xmis; x++)
- cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
+ cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
}
-static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+static void copy_segment_id(const VP9_COMMON *cm,
+ const uint8_t *last_segment_ids,
+ uint8_t *current_segment_ids,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int xmis = MIN(cm->mi_cols - mi_col, bw);
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);
+ int x, y;
+
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++)
+ current_segment_ids[mi_offset + y * cm->mi_cols + x] = last_segment_ids ?
+ last_segment_ids[mi_offset + y * cm->mi_cols + x] : 0;
+}
+
+static int read_intra_segment_id(VP9_COMMON *const cm, BLOCK_SIZE bsize,
int mi_row, int mi_col,
vp9_reader *r) {
struct segmentation *const seg = &cm->seg;
- const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type;
int segment_id;
if (!seg->enabled)
return 0; // Default for disabled segmentation
- if (!seg->update_map)
+ if (!seg->update_map) {
+ copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+ bsize, mi_row, mi_col);
return 0;
+ }
segment_id = read_segment_id(r, seg);
set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
@@ -120,17 +145,21 @@ static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int mi_row, int mi_col, vp9_reader *r) {
struct segmentation *const seg = &cm->seg;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
int predicted_segment_id, segment_id;
if (!seg->enabled)
return 0; // Default for disabled segmentation
- predicted_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
- bsize, mi_row, mi_col);
- if (!seg->update_map)
+ predicted_segment_id = cm->last_frame_seg_map ?
+ vp9_get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) : 0;
+
+ if (!seg->update_map) {
+ copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+ bsize, mi_row, mi_col);
return predicted_segment_id;
+ }
if (seg->temporal_update) {
const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
@@ -145,31 +174,33 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
}
static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
int segment_id, vp9_reader *r) {
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
return 1;
} else {
const int ctx = vp9_get_skip_context(xd);
- const int skip = vp9_read(r, cm->fc.skip_probs[ctx]);
+ const int skip = vp9_read(r, cm->fc->skip_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.skip[ctx][skip];
+ ++counts->skip[ctx][skip];
return skip;
}
}
static void read_intra_frame_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
int mi_row, int mi_col, vp9_reader *r) {
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
- const MODE_INFO *above_mi = xd->mi[-cm->mi_stride].src_mi;
- const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
const BLOCK_SIZE bsize = mbmi->sb_type;
int i;
- mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r);
- mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
- mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, bsize, 1, r);
+ mbmi->segment_id = read_intra_segment_id(cm, bsize, mi_row, mi_col, r);
+ mbmi->skip = read_skip(cm, xd, counts, mbmi->segment_id, r);
+ mbmi->tx_size = read_tx_size(cm, xd, counts, 1, r);
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->ref_frame[1] = NONE;
@@ -254,13 +285,14 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
const MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
vp9_reader *r) {
if (cm->reference_mode == REFERENCE_MODE_SELECT) {
const int ctx = vp9_get_reference_mode_context(cm, xd);
const REFERENCE_MODE mode =
- (REFERENCE_MODE)vp9_read(r, cm->fc.comp_inter_prob[ctx]);
+ (REFERENCE_MODE)vp9_read(r, cm->fc->comp_inter_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.comp_inter[ctx][mode];
+ ++counts->comp_inter[ctx][mode];
return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE
} else {
return cm->reference_mode;
@@ -269,17 +301,16 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
// Read the reference frame
static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- vp9_reader *r,
+ FRAME_COUNTS *counts, vp9_reader *r,
int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
- FRAME_CONTEXT *const fc = &cm->fc;
- FRAME_COUNTS *const counts = &cm->counts;
+ FRAME_CONTEXT *const fc = cm->fc;
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
ref_frame[0] = (MV_REFERENCE_FRAME)vp9_get_segdata(&cm->seg, segment_id,
SEG_LVL_REF_FRAME);
ref_frame[1] = NONE;
} else {
- const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
+ const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, counts, r);
// FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
if (mode == COMPOUND_REFERENCE) {
const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
@@ -313,17 +344,19 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static INLINE INTERP_FILTER read_switchable_interp_filter(
- VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) {
+ VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts, vp9_reader *r) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
const INTERP_FILTER type =
(INTERP_FILTER)vp9_read_tree(r, vp9_switchable_interp_tree,
- cm->fc.switchable_interp_prob[ctx]);
+ cm->fc->switchable_interp_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.switchable_interp[ctx][type];
+ ++counts->switchable_interp[ctx][type];
return type;
}
-static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
+static void read_intra_block_mode_info(VP9_COMMON *const cm,
+ FRAME_COUNTS *counts, MODE_INFO *mi,
vp9_reader *r) {
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mi->mbmi.sb_type;
@@ -335,24 +368,26 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
switch (bsize) {
case BLOCK_4X4:
for (i = 0; i < 4; ++i)
- mi->bmi[i].as_mode = read_intra_mode_y(cm, r, 0);
+ mi->bmi[i].as_mode = read_intra_mode_y(cm, counts, r, 0);
mbmi->mode = mi->bmi[3].as_mode;
break;
case BLOCK_4X8:
- mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, r, 0);
+ mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, counts,
+ r, 0);
mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
- read_intra_mode_y(cm, r, 0);
+ read_intra_mode_y(cm, counts, r, 0);
break;
case BLOCK_8X4:
- mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, r, 0);
+ mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, counts,
+ r, 0);
mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
- read_intra_mode_y(cm, r, 0);
+ read_intra_mode_y(cm, counts, r, 0);
break;
default:
- mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]);
+ mbmi->mode = read_intra_mode_y(cm, counts, r, size_group_lookup[bsize]);
}
- mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
+ mbmi->uv_mode = read_intra_mode_uv(cm, counts, r, mbmi->mode);
}
static INLINE int is_mv_valid(const MV *mv) {
@@ -360,7 +395,8 @@ static INLINE int is_mv_valid(const MV *mv) {
mv->col > MV_LOW && mv->col < MV_UPP;
}
-static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode,
+static INLINE int assign_mv(VP9_COMMON *cm, FRAME_COUNTS *counts,
+ PREDICTION_MODE mode,
int_mv mv[2], int_mv ref_mv[2],
int_mv nearest_mv[2], int_mv near_mv[2],
int is_compound, int allow_hp, vp9_reader *r) {
@@ -370,9 +406,9 @@ static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode,
switch (mode) {
case NEWMV: {
nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
- NULL : &cm->counts.mv;
+ NULL : &counts->mv;
for (i = 0; i < 1 + is_compound; ++i) {
- read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc.nmvc, mv_counts,
+ read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts,
allow_hp);
ret = ret && is_mv_valid(&mv[i].as_mv);
}
@@ -404,32 +440,40 @@ static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode,
}
static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
int segment_id, vp9_reader *r) {
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) !=
INTRA_FRAME;
} else {
const int ctx = vp9_get_intra_inter_context(xd);
- const int is_inter = vp9_read(r, cm->fc.intra_inter_prob[ctx]);
+ const int is_inter = vp9_read(r, cm->fc->intra_inter_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.intra_inter[ctx][is_inter];
+ ++counts->intra_inter[ctx][is_inter];
return is_inter;
}
}
-static void read_inter_block_mode_info(VP9_COMMON *const cm,
+static void fpm_sync(void *const data, int mi_row) {
+ VP9Decoder *const pbi = (VP9Decoder *)data;
+ vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
+ mi_row << MI_BLOCK_SIZE_LOG2);
+}
+
+static void read_inter_block_mode_info(VP9Decoder *const pbi,
MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
const TileInfo *const tile,
MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
const int allow_hp = cm->allow_high_precision_mv;
-
int_mv nearestmv[2], nearmv[2];
int inter_mode_ctx, ref, is_compound;
- read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
+ read_ref_frames(cm, xd, counts, r, mbmi->segment_id, mbmi->ref_frame);
is_compound = has_second_ref(mbmi);
for (ref = 0; ref < 1 + is_compound; ++ref) {
@@ -437,15 +481,12 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
xd->block_refs[ref] = ref_buf;
if ((!vp9_is_valid_scale(&ref_buf->sf)))
- vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+ vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
"Reference frame has invalid dimensions");
- if (ref_buf->buf->corrupted)
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Block reference is corrupt");
vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
&ref_buf->sf);
vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
- mi_row, mi_col);
+ mi_row, mi_col, fpm_sync, (void *)pbi);
}
inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
@@ -453,13 +494,13 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
mbmi->mode = ZEROMV;
if (bsize < BLOCK_8X8) {
- vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+ vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid usage of segement feature on small blocks");
return;
}
} else {
if (bsize >= BLOCK_8X8)
- mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
+ mbmi->mode = read_inter_mode(cm, counts, r, inter_mode_ctx);
}
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
@@ -470,7 +511,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
}
mbmi->interp_filter = (cm->interp_filter == SWITCHABLE)
- ? read_switchable_interp_filter(cm, xd, r)
+ ? read_switchable_interp_filter(cm, xd, counts, r)
: cm->interp_filter;
if (bsize < BLOCK_8X8) {
@@ -483,7 +524,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
for (idx = 0; idx < 2; idx += num_4x4_w) {
int_mv block[2];
const int j = idy * 2 + idx;
- b_mode = read_inter_mode(cm, r, inter_mode_ctx);
+ b_mode = read_inter_mode(cm, counts, r, inter_mode_ctx);
if (b_mode == NEARESTMV || b_mode == NEARMV)
for (ref = 0; ref < 1 + is_compound; ++ref)
@@ -491,7 +532,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
&nearest_sub8x8[ref],
&near_sub8x8[ref]);
- if (!assign_mv(cm, b_mode, block, nearestmv,
+ if (!assign_mv(cm, counts, b_mode, block, nearestmv,
nearest_sub8x8, near_sub8x8,
is_compound, allow_hp, r)) {
xd->corrupted |= 1;
@@ -514,38 +555,60 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
} else {
- xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv,
+ xd->corrupted |= !assign_mv(cm, counts, mbmi->mode, mbmi->mv, nearestmv,
nearestmv, nearmv, is_compound, allow_hp, r);
}
}
-static void read_inter_frame_mode_info(VP9_COMMON *const cm,
+static void read_inter_frame_mode_info(VP9Decoder *const pbi,
MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
const TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r) {
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ VP9_COMMON *const cm = &pbi->common;
+ MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
int inter_block;
mbmi->mv[0].as_int = 0;
mbmi->mv[1].as_int = 0;
mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
- mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
- inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
- mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
- !mbmi->skip || !inter_block, r);
+ mbmi->skip = read_skip(cm, xd, counts, mbmi->segment_id, r);
+ inter_block = read_is_inter_block(cm, xd, counts, mbmi->segment_id, r);
+ mbmi->tx_size = read_tx_size(cm, xd, counts, !mbmi->skip || !inter_block, r);
if (inter_block)
- read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
+ read_inter_block_mode_info(pbi, xd, counts, tile, mi, mi_row, mi_col, r);
else
- read_intra_block_mode_info(cm, mi, r);
+ read_intra_block_mode_info(cm, counts, mi, r);
}
-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
const TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MODE_INFO *const mi = xd->mi[0];
+ const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+ const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+ const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+ MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
if (frame_is_intra_only(cm))
- read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+ read_intra_frame_mode_info(cm, xd, counts, mi_row, mi_col, r);
else
- read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
+ read_inter_frame_mode_info(pbi, xd, counts, tile, mi_row, mi_col, r);
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
+ }
}
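
The loop appended to vp9_read_mode_info() above publishes each block's reference frames and motion vectors into a per-frame MV grid (cm->cur_frame->mvs) so that later frames can consult the previous motion field; the MIN() clamps keep blocks on the right and bottom frame edges from writing outside the grid. A minimal standalone sketch of that clipped replication, with MvRef and store_block_mvs as simplified stand-ins rather than the real libvpx types:

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    typedef struct { int ref_frame[2]; int mv_as_int[2]; } MvRef;

    /* Replicate one block's motion info across its footprint (bw x bh
     * 8x8 units) in a frame-wide grid, clipping at the frame edges. */
    static void store_block_mvs(MvRef *grid, int grid_cols, int grid_rows,
                                int mi_row, int mi_col, int bw, int bh,
                                const MvRef *block) {
      const int x_mis = MIN(bw, grid_cols - mi_col);  /* visible width  */
      const int y_mis = MIN(bh, grid_rows - mi_row);  /* visible height */
      int w, h;
      for (h = 0; h < y_mis; ++h) {
        MvRef *row = grid + (mi_row + h) * grid_cols + mi_col;
        for (w = 0; w < x_mis; ++w)
          row[w] = *block;  /* every unit carries the whole block's info */
      }
    }

Replicating the block's info into every 8x8 unit keeps the later lookup a plain array index, at the cost of some redundant stores.
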
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h
index 7394b62b451..c79dff71888 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h
@@ -11,6 +11,7 @@
#ifndef VP9_DECODER_VP9_DECODEMV_H_
#define VP9_DECODER_VP9_DECODEMV_H_
+#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_reader.h"
#ifdef __cplusplus
@@ -19,7 +20,8 @@ extern "C" {
struct TileInfo;
-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
const struct TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c
index baf6ab7ef52..288d8690ca2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c
@@ -12,9 +12,12 @@
#include <limits.h>
#include <stdio.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/vpx_scale.h"
@@ -27,25 +30,53 @@
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decodeframe.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_detokenize.h"
-#include "vp9/decoder/vp9_dthread.h"
-static void initialize_dec() {
- static int init_done = 0;
+static void initialize_dec(void) {
+ static volatile int init_done = 0;
if (!init_done) {
vp9_rtcd();
+ vpx_dsp_rtcd();
+ vpx_scale_rtcd();
vp9_init_intra_predictors();
init_done = 1;
}
}
-VP9Decoder *vp9_decoder_create() {
- VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi));
- VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
+static void vp9_dec_setup_mi(VP9_COMMON *cm) {
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ memset(cm->mi_grid_base, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) {
+ cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+ if (!cm->mip)
+ return 1;
+ cm->mi_alloc_size = mi_size;
+ cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+ if (!cm->mi_grid_base)
+ return 1;
+ return 0;
+}
+
+static void vp9_dec_free_mi(VP9_COMMON *cm) {
+ vpx_free(cm->mip);
+ cm->mip = NULL;
+ vpx_free(cm->mi_grid_base);
+ cm->mi_grid_base = NULL;
+}
+
+VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
+ VP9Decoder *volatile const pbi = vpx_memalign(32, sizeof(*pbi));
+ VP9_COMMON *volatile const cm = pbi ? &pbi->common : NULL;
if (!cm)
return NULL;
@@ -59,21 +90,30 @@ VP9Decoder *vp9_decoder_create() {
}
cm->error.setjmp = 1;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(cm, cm->frame_contexts,
+ (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+ sizeof(*cm->frame_contexts)));
+
pbi->need_resync = 1;
- initialize_dec();
+ once(initialize_dec);
// Initialize the references to not point to any frame buffers.
- vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
cm->current_video_frame = 0;
pbi->ready_for_new_data = 1;
+ pbi->common.buffer_pool = pool;
+
cm->bit_depth = VPX_BITS_8;
cm->dequant_bit_depth = VPX_BITS_8;
- // vp9_init_dequantizer() is first called here. Add check in
- // frame_init_dequantizer() to avoid unnecessary calling of
- // vp9_init_dequantizer() for every frame.
- vp9_init_dequantizer(cm);
+ cm->alloc_mi = vp9_dec_alloc_mi;
+ cm->free_mi = vp9_dec_free_mi;
+ cm->setup_mi = vp9_dec_setup_mi;
vp9_loop_filter_init(cm);
@@ -85,7 +125,6 @@ VP9Decoder *vp9_decoder_create() {
}
void vp9_decoder_remove(VP9Decoder *pbi) {
- VP9_COMMON *const cm = &pbi->common;
int i;
vp9_get_worker_interface()->end(&pbi->lf_worker);
@@ -94,16 +133,15 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
for (i = 0; i < pbi->num_tile_workers; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
vp9_get_worker_interface()->end(worker);
- vpx_free(worker->data1);
- vpx_free(worker->data2);
}
+ vpx_free(pbi->tile_worker_data);
+ vpx_free(pbi->tile_worker_info);
vpx_free(pbi->tile_workers);
if (pbi->num_tile_workers > 0) {
vp9_loop_filter_dealloc(&pbi->lf_row_sync);
}
- vp9_remove_common(cm);
vpx_free(pbi);
}
@@ -148,6 +186,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
VP9_REFFRAME ref_frame_flag,
YV12_BUFFER_CONFIG *sd) {
RefBuffer *ref_buf = NULL;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
// TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
// encoder is using the frame buffers for. This is just a stub to keep the
@@ -175,11 +214,11 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
const int free_fb = get_free_fb(cm);
// Decrease ref_count since it will be increased again in
// ref_cnt_fb() below.
- cm->frame_bufs[free_fb].ref_count--;
+ --frame_bufs[free_fb].ref_count;
// Manage the reference counters and copy image.
- ref_cnt_fb(cm->frame_bufs, ref_fb_ptr, free_fb);
- ref_buf->buf = &cm->frame_bufs[*ref_fb_ptr].buf;
+ ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
+ ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
vp8_yv12_copy_frame(sd, ref_buf->buf);
}
@@ -190,33 +229,51 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
static void swap_frame_buffers(VP9Decoder *pbi) {
int ref_index = 0, mask;
VP9_COMMON *const cm = &pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ lock_buffer_pool(pool);
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
- if (mask & 1) {
- const int old_idx = cm->ref_frame_map[ref_index];
- ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[ref_index],
- cm->new_fb_idx);
- if (old_idx >= 0 && cm->frame_bufs[old_idx].ref_count == 0)
- cm->release_fb_cb(cm->cb_priv,
- &cm->frame_bufs[old_idx].raw_frame_buffer);
+ const int old_idx = cm->ref_frame_map[ref_index];
+    // The current thread releases its hold on the reference frame.
+    decrease_ref_count(old_idx, frame_bufs, pool);
+
+    // Release the reference frame in the reference map.
+ if ((mask & 1) && old_idx >= 0) {
+ decrease_ref_count(old_idx, frame_bufs, pool);
}
+ cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
++ref_index;
}
+  // The current thread releases its hold on the reference frames.
+ for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+ }
+ unlock_buffer_pool(pool);
+ pbi->hold_ref_buf = 0;
cm->frame_to_show = get_frame_new_buffer(cm);
- cm->frame_bufs[cm->new_fb_idx].ref_count--;
+
+ if (!pbi->frame_parallel_decode || !cm->show_frame) {
+ lock_buffer_pool(pool);
+ --frame_bufs[cm->new_fb_idx].ref_count;
+ unlock_buffer_pool(pool);
+ }
// Invalidate these references until the next frame starts.
for (ref_index = 0; ref_index < 3; ref_index++)
- cm->frame_refs[ref_index].idx = INT_MAX;
+ cm->frame_refs[ref_index].idx = -1;
}
int vp9_receive_compressed_data(VP9Decoder *pbi,
size_t size, const uint8_t **psource) {
- VP9_COMMON *const cm = &pbi->common;
+ VP9_COMMON *volatile const cm = &pbi->common;
+ BufferPool *volatile const pool = cm->buffer_pool;
+ RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
const uint8_t *source = *psource;
int retcode = 0;
-
cm->error.error_code = VPX_CODEC_OK;
if (size == 0) {
@@ -228,57 +285,120 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
// TODO(jkoleszar): Error concealment is undefined and non-normative
// at this point, but if it becomes so, [0] may not always be the correct
// thing to do here.
- if (cm->frame_refs[0].idx != INT_MAX)
+ if (cm->frame_refs[0].idx > 0) {
+ assert(cm->frame_refs[0].buf != NULL);
cm->frame_refs[0].buf->corrupted = 1;
+ }
}
pbi->ready_for_new_data = 0;
// Check if the previous frame was a frame without any references to it.
- if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
- cm->release_fb_cb(cm->cb_priv,
- &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+ // Release frame buffer if not decoding in frame parallel mode.
+ if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
+ && frame_bufs[cm->new_fb_idx].ref_count == 0)
+ pool->release_fb_cb(pool->cb_priv,
+ &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
cm->new_fb_idx = get_free_fb(cm);
+  // Assign an MV array to the frame buffer.
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+ pbi->hold_ref_buf = 0;
+ if (pbi->frame_parallel_decode) {
+ VP9Worker *const worker = pbi->frame_worker_owner;
+ vp9_frameworker_lock_stats(worker);
+ frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
+ // Reset decoding progress.
+ pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+ pbi->cur_buf->row = -1;
+ pbi->cur_buf->col = -1;
+ vp9_frameworker_unlock_stats(worker);
+ } else {
+ pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+ }
+
+
if (setjmp(cm->error.jmp)) {
- pbi->need_resync = 1;
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ int i;
+
cm->error.setjmp = 0;
- vp9_clear_system_state();
+ pbi->ready_for_new_data = 1;
- // We do not know if the missing frame(s) was supposed to update
- // any of the reference buffers, but we act conservative and
- // mark only the last buffer as corrupted.
- //
- // TODO(jkoleszar): Error concealment is undefined and non-normative
- // at this point, but if it becomes so, [0] may not always be the correct
- // thing to do here.
- if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL)
- cm->frame_refs[0].buf->corrupted = 1;
+ // Synchronize all threads immediately as a subsequent decode call may
+ // cause a resize invalidating some allocations.
+ winterface->sync(&pbi->lf_worker);
+ for (i = 0; i < pbi->num_tile_workers; ++i) {
+ winterface->sync(&pbi->tile_workers[i]);
+ }
+
+ lock_buffer_pool(pool);
+ // Release all the reference buffers if worker thread is holding them.
+ if (pbi->hold_ref_buf == 1) {
+ int ref_index = 0, mask;
+ for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+      // The current thread releases its hold on the reference frame.
+      decrease_ref_count(old_idx, frame_bufs, pool);
+
+      // Release the reference frame in the reference map.
+ if ((mask & 1) && old_idx >= 0) {
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ }
+ ++ref_index;
+ }
- if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
- cm->frame_bufs[cm->new_fb_idx].ref_count--;
+    // The current thread releases its hold on the reference frames.
+ for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ }
+ pbi->hold_ref_buf = 0;
+ }
+ // Release current frame.
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ vp9_clear_system_state();
return -1;
}
cm->error.setjmp = 1;
-
vp9_decode_frame(pbi, source, source + size, psource);
swap_frame_buffers(pbi);
vp9_clear_system_state();
- cm->last_width = cm->width;
- cm->last_height = cm->height;
-
- if (!cm->show_existing_frame)
+ if (!cm->show_existing_frame) {
cm->last_show_frame = cm->show_frame;
- if (cm->show_frame) {
- if (!cm->show_existing_frame)
- vp9_swap_mi_and_prev_mi(cm);
+ cm->prev_frame = cm->cur_frame;
+ if (cm->seg.enabled && !pbi->frame_parallel_decode)
+ vp9_swap_current_and_last_seg_map(cm);
+ }
+
+ // Update progress in frame parallel decode.
+ if (pbi->frame_parallel_decode) {
+ // Need to lock the mutex here as another thread may
+ // be accessing this buffer.
+ VP9Worker *const worker = pbi->frame_worker_owner;
+ FrameWorkerData *const frame_worker_data = worker->data1;
+ vp9_frameworker_lock_stats(worker);
- cm->current_video_frame++;
+ if (cm->show_frame) {
+ cm->current_video_frame++;
+ }
+ frame_worker_data->frame_decoded = 1;
+ frame_worker_data->frame_context_ready = 1;
+ vp9_frameworker_signal_stats(worker);
+ vp9_frameworker_unlock_stats(worker);
+ } else {
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+ if (cm->show_frame) {
+ cm->current_video_frame++;
+ }
}
cm->error.setjmp = 0;
@@ -302,6 +422,8 @@ int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
if (!cm->show_frame)
return ret;
+ pbi->ready_for_new_data = 1;
+
#if CONFIG_VP9_POSTPROC
if (!cm->show_existing_frame) {
ret = vp9_post_proc_frame(cm, sd, flags);
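
The reworked vp9_receive_compressed_data() above leans on a setjmp/longjmp error path: the handler now synchronizes the loop-filter and tile workers before touching any allocations, then drops every reference the thread still holds under the buffer-pool lock. A compact sketch of that pattern, with stub types standing in for VP9Decoder and the worker interface (assumed names, not the real API):

    #include <setjmp.h>

    typedef struct { jmp_buf jmp; int setjmp_armed; } Dec;

    /* Deep decode code calls this on an unrecoverable bitstream error. */
    static void fatal(Dec *d) { longjmp(d->jmp, 1); }

    static int decode_frame(Dec *d, const unsigned char *data, int size) {
      if (setjmp(d->jmp)) {        /* re-entered via fatal() */
        d->setjmp_armed = 0;
        /* 1. sync worker threads so nothing races the cleanup        */
        /* 2. release held reference buffers under the pool mutex     */
        /* 3. release the current frame buffer and clear system state */
        return -1;
      }
      d->setjmp_armed = 1;
      (void)data; (void)size;      /* ...parse headers, decode tiles... */
      d->setjmp_armed = 0;
      return 0;
    }

Syncing the workers first matters because a subsequent decode call may resize and free buffers the workers are still reading.
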
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.h
index 4f52bb9c473..c19f0ac3bc7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.h
@@ -15,12 +15,12 @@
#include "vpx/vpx_codec.h"
#include "vpx_scale/yv12config.h"
-
+#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_ppflags.h"
#include "vp9/common/vp9_thread.h"
-
#include "vp9/decoder/vp9_dthread.h"
+#include "vp9/decoder/vp9_reader.h"
#ifdef __cplusplus
extern "C" {
@@ -33,6 +33,14 @@ typedef struct TileData {
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
} TileData;
+typedef struct TileWorkerData {
+ struct VP9Decoder *pbi;
+ vp9_reader bit_reader;
+ FRAME_COUNTS counts;
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+ struct vpx_internal_error_info error_info;
+} TileWorkerData;
+
typedef struct VP9Decoder {
DECLARE_ALIGNED(16, MACROBLOCKD, mb);
@@ -44,8 +52,15 @@ typedef struct VP9Decoder {
int frame_parallel_decode; // frame-based threading.
+ // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+ // the same.
+ RefCntBuffer *cur_buf; // Current decoding frame buffer.
+
+ VP9Worker *frame_worker_owner; // frame_worker that owns this pbi.
VP9Worker lf_worker;
VP9Worker *tile_workers;
+ TileWorkerData *tile_worker_data;
+ TileInfo *tile_worker_info;
int num_tile_workers;
TileData *tile_data;
@@ -58,7 +73,8 @@ typedef struct VP9Decoder {
int max_threads;
int inv_tile_order;
- int need_resync; // wait for key/intra-only frame
+ int need_resync; // wait for key/intra-only frame.
+ int hold_ref_buf; // hold the reference buffer.
} VP9Decoder;
int vp9_receive_compressed_data(struct VP9Decoder *pbi,
@@ -75,10 +91,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
VP9_REFFRAME ref_frame_flag,
YV12_BUFFER_CONFIG *sd);
-struct VP9Decoder *vp9_decoder_create();
-
-void vp9_decoder_remove(struct VP9Decoder *pbi);
-
static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
void *decrypt_state,
const uint8_t *data) {
@@ -98,6 +110,25 @@ vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
vpx_decrypt_cb decrypt_cb,
void *decrypt_state);
+struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);
+
+void vp9_decoder_remove(struct VP9Decoder *pbi);
+
+static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+ BufferPool *const pool) {
+ if (idx >= 0) {
+ --frame_bufs[idx].ref_count;
+    // A worker may only get a free framebuffer index when calling get_free_fb,
+    // but the private buffer is not set up until the frame header has been
+    // decoded. So if an error occurs while decoding the header, frame_bufs
+    // will not yet have a valid priv buffer.
+ if (frame_bufs[idx].ref_count == 0 &&
+ frame_bufs[idx].raw_frame_buffer.priv) {
+ pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
+ }
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
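
The decrease_ref_count() helper added above encodes two invariants worth calling out: an index of -1 means "no buffer held", and a buffer is handed back to the pool only when its count reaches zero and its private storage was actually allocated, since errors during header decoding can leave priv unset. A self-contained sketch of the same logic with hypothetical stand-in types:

    typedef struct { int ref_count; void *priv; } Buf;

    static void unref(Buf *bufs, int idx,
                      void (*release)(void *ctx, Buf *b), void *ctx) {
      if (idx < 0)
        return;                     /* -1 marks "no buffer held" */
      --bufs[idx].ref_count;
      /* Only fully initialized buffers (priv set) go back to the pool. */
      if (bufs[idx].ref_count == 0 && bufs[idx].priv != NULL)
        release(ctx, &bufs[idx]);
    }
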
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c
index 421229a28c2..bb8c66fc09f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c
@@ -14,6 +14,9 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropy.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "vp9/common/vp9_idct.h"
+#endif
#include "vp9/decoder/vp9_detokenize.h"
@@ -32,7 +35,7 @@
#define INCREMENT_COUNT(token) \
do { \
if (!cm->frame_parallel_decoding_mode) \
- ++coef_counts[band][ctx][token]; \
+ ++coef_counts[band][ctx][token]; \
} while (0)
static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
@@ -42,25 +45,14 @@ static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
return val;
}
-static const vp9_tree_index coeff_subtree_high[TREE_SIZE(ENTROPY_TOKENS)] = {
- 2, 6, /* 0 = LOW_VAL */
- -TWO_TOKEN, 4, /* 1 = TWO */
- -THREE_TOKEN, -FOUR_TOKEN, /* 2 = THREE */
- 8, 10, /* 3 = HIGH_LOW */
- -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, /* 4 = CAT_ONE */
- 12, 14, /* 5 = CAT_THREEFOUR */
- -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, /* 6 = CAT_THREE */
- -CATEGORY5_TOKEN, -CATEGORY6_TOKEN /* 7 = CAT_FIVE */
-};
-
-static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
+static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
+ FRAME_COUNTS *counts, PLANE_TYPE type,
tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
int ctx, const int16_t *scan, const int16_t *nb,
vp9_reader *r) {
const int max_eob = 16 << (tx_size << 1);
- const FRAME_CONTEXT *const fc = &cm->fc;
- FRAME_COUNTS *const counts = &cm->counts;
- const int ref = is_inter_block(&xd->mi[0].src_mi->mbmi);
+ const FRAME_CONTEXT *const fc = cm->fc;
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
int band, c = 0;
const vp9_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
fc->coef_probs[tx_size][type][ref];
@@ -144,7 +136,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
val = 1;
} else {
INCREMENT_COUNT(TWO_TOKEN);
- token = vp9_read_tree(r, coeff_subtree_high,
+ token = vp9_read_tree(r, vp9_coef_con_tree,
vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
switch (token) {
case TWO_TOKEN:
@@ -191,10 +183,15 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
}
v = (val * dqv) >> dq_shift;
#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_VP9_HIGHBITDEPTH
+ dqcoeff[scan[c]] = highbd_check_range((vp9_read_bit(r) ? -v : v),
+ cm->bit_depth);
+#else
dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v);
+#endif // CONFIG_VP9_HIGHBITDEPTH
#else
dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;
-#endif
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
token_cache[scan[c]] = vp9_pt_energy_class[token];
++c;
ctx = get_coef_context(nb, token_cache, c);
@@ -205,15 +202,19 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
}
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
- int plane, int block, BLOCK_SIZE plane_bsize,
- int x, int y, TX_SIZE tx_size, vp9_reader *r) {
+ FRAME_COUNTS *counts, int plane, int block,
+ BLOCK_SIZE plane_bsize, int x, int y,
+ TX_SIZE tx_size, vp9_reader *r,
+ int seg_id) {
struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int16_t *const dequant = (plane == 0) ? cm->y_dequant[seg_id]
+ : cm->uv_dequant[seg_id];
const int ctx = get_entropy_context(tx_size, pd->above_context + x,
pd->left_context + y);
const scan_order *so = get_scan(xd, tx_size, pd->plane_type, block);
- const int eob = decode_coefs(cm, xd, pd->plane_type,
+ const int eob = decode_coefs(cm, xd, counts, pd->plane_type,
BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
- pd->dequant, ctx, so->scan, so->neighbors, r);
+ dequant, ctx, so->scan, so->neighbors, r);
vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
return eob;
}
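
vp9_decode_block_tokens() now picks the dequantization table per block from per-segment arrays instead of a per-plane pointer configured once per frame, which is what the new seg_id parameter feeds. A toy version of that selection (array shapes are illustrative, not the real layout):

    #define MAX_SEGMENTS 8

    static const short y_dequant[MAX_SEGMENTS][2];   /* [segment][dc, ac] */
    static const short uv_dequant[MAX_SEGMENTS][2];

    /* Plane 0 is luma; both chroma planes share the uv table. */
    static const short *select_dequant(int plane, int seg_id) {
      return (plane == 0) ? y_dequant[seg_id] : uv_dequant[seg_id];
    }
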
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h
index 5278e97a302..86126b6a19e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h
@@ -20,8 +20,10 @@ extern "C" {
#endif
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
- int plane, int block, BLOCK_SIZE plane_bsize,
- int x, int y, TX_SIZE tx_size, vp9_reader *r);
+ FRAME_COUNTS *counts, int plane, int block,
+ BLOCK_SIZE plane_bsize, int x, int y,
+ TX_SIZE tx_size, vp9_reader *r,
+ int seg_id);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c
index 69e4fde8586..96a63bd9e14 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c
@@ -9,265 +9,181 @@
*/
#include "./vpx_config.h"
-
#include "vpx_mem/vpx_mem.h"
-
#include "vp9/common/vp9_reconinter.h"
-
#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_decoder.h"
-#if CONFIG_MULTITHREAD
-static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
- const int kMaxTryLocks = 4000;
- int locked = 0;
- int i;
-
- for (i = 0; i < kMaxTryLocks; ++i) {
- if (!pthread_mutex_trylock(mutex)) {
- locked = 1;
- break;
- }
- }
+// #define DEBUG_THREAD
- if (!locked)
- pthread_mutex_lock(mutex);
+// TODO(hkuang): Clean up all the #ifdef in this file.
+void vp9_frameworker_lock_stats(VP9Worker *const worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const worker_data = worker->data1;
+ pthread_mutex_lock(&worker_data->stats_mutex);
+#else
+ (void)worker;
+#endif
}
-#endif // CONFIG_MULTITHREAD
-static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
+void vp9_frameworker_unlock_stats(VP9Worker *const worker) {
#if CONFIG_MULTITHREAD
- const int nsync = lf_sync->sync_range;
-
- if (r && !(c & (nsync - 1))) {
- pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
- mutex_lock(mutex);
-
- while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
- pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
- }
- pthread_mutex_unlock(mutex);
- }
+ FrameWorkerData *const worker_data = worker->data1;
+ pthread_mutex_unlock(&worker_data->stats_mutex);
#else
- (void)lf_sync;
- (void)r;
- (void)c;
-#endif // CONFIG_MULTITHREAD
+ (void)worker;
+#endif
}
-static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
- const int sb_cols) {
+void vp9_frameworker_signal_stats(VP9Worker *const worker) {
#if CONFIG_MULTITHREAD
- const int nsync = lf_sync->sync_range;
- int cur;
- // Only signal when there are enough filtered SB for next row to run.
- int sig = 1;
+ FrameWorkerData *const worker_data = worker->data1;
- if (c < sb_cols - 1) {
- cur = c;
- if (c % nsync)
- sig = 0;
- } else {
- cur = sb_cols + nsync;
- }
-
- if (sig) {
- mutex_lock(&lf_sync->mutex_[r]);
-
- lf_sync->cur_sb_col[r] = cur;
+// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+ pthread_cond_signal(&worker_data->stats_cond);
+#else
+ pthread_cond_broadcast(&worker_data->stats_cond);
+#endif
- pthread_cond_signal(&lf_sync->cond_[r]);
- pthread_mutex_unlock(&lf_sync->mutex_[r]);
- }
#else
- (void)lf_sync;
- (void)r;
- (void)c;
- (void)sb_cols;
-#endif // CONFIG_MULTITHREAD
+ (void)worker;
+#endif
}
-// Implement row loopfiltering for each thread.
-static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
- VP9_COMMON *const cm,
- struct macroblockd_plane planes[MAX_MB_PLANE],
- int start, int stop, int y_only,
- VP9LfSync *const lf_sync, int num_lf_workers) {
- const int num_planes = y_only ? 1 : MAX_MB_PLANE;
- int r, c; // SB row and col
- const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
-
- for (r = start; r < stop; r += num_lf_workers) {
- const int mi_row = r << MI_BLOCK_SIZE_LOG2;
- MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;
-
- for (c = 0; c < sb_cols; ++c) {
- const int mi_col = c << MI_BLOCK_SIZE_LOG2;
- LOOP_FILTER_MASK lfm;
- int plane;
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define BUILDING_WITH_TSAN
+#endif
+#endif
- sync_read(lf_sync, r, c);
-
- vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
- vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+// TODO(hkuang): Remove worker parameter as it is only used in debug code.
+void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
+ int row) {
+#if CONFIG_MULTITHREAD
+ if (!ref_buf)
+ return;
- for (plane = 0; plane < num_planes; ++plane) {
- vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
- }
+#ifndef BUILDING_WITH_TSAN
+  // The following check triggers a harmless tsan warning, but it is the key
+  // to getting the best performance.
+ if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
+#endif
- sync_write(lf_sync, r, c, sb_cols);
+ {
+    // Find the worker thread that owns the reference frame. If the reference
+    // frame has been fully decoded, it may not have an owner.
+ VP9Worker *const ref_worker = ref_buf->frame_worker_owner;
+ FrameWorkerData *const ref_worker_data =
+ (FrameWorkerData *)ref_worker->data1;
+ const VP9Decoder *const pbi = ref_worker_data->pbi;
+
+#ifdef DEBUG_THREAD
+ {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n",
+ worker_data->worker_id, worker, ref_worker_data->worker_id,
+ ref_buf->frame_worker_owner, row, ref_buf->row);
}
- }
-}
-
-// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(TileWorkerData *const tile_data,
- void *unused) {
- LFWorkerData *const lf_data = &tile_data->lfdata;
- (void)unused;
- loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
- lf_data->start, lf_data->stop, lf_data->y_only,
- lf_data->lf_sync, lf_data->num_lf_workers);
- return 1;
-}
-
-// VP9 decoder: Implement multi-threaded loopfilter that uses the tile
-// threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
- VP9Decoder *pbi, VP9_COMMON *cm,
- int frame_filter_level,
- int y_only) {
- VP9LfSync *const lf_sync = &pbi->lf_row_sync;
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
- // Number of superblock rows and cols
- const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
- int i;
-
- if (!frame_filter_level) return;
-
- if (!lf_sync->sync_range || cm->last_height != cm->height) {
- vp9_loop_filter_dealloc(lf_sync);
- vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
- }
-
- vp9_loop_filter_frame_init(cm, frame_filter_level);
-
- // Initialize cur_sb_col to -1 for all SB rows.
- vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
-
- // Set up loopfilter thread data.
- // The decoder is using num_workers instead of pbi->num_tile_workers
- // because it has been observed that using more threads on the
- // loopfilter, than there are tile columns in the frame will hurt
- // performance on Android. This is because the system will only
- // schedule the tile decode workers on cores equal to the number
- // of tile columns. Then if the decoder tries to use more threads for the
- // loopfilter, it will hurt performance because of contention. If the
- // multithreading code changes in the future then the number of workers
- // used by the loopfilter should be revisited.
- for (i = 0; i < num_workers; ++i) {
- VP9Worker *const worker = &pbi->tile_workers[i];
- TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
- LFWorkerData *const lf_data = &tile_data->lfdata;
-
- worker->hook = (VP9WorkerHook)loop_filter_row_worker;
+#endif
- // Loopfilter data
- lf_data->frame_buffer = frame;
- lf_data->cm = cm;
- vp9_copy(lf_data->planes, pbi->mb.plane);
- lf_data->start = i;
- lf_data->stop = sb_rows;
- lf_data->y_only = y_only; // always do all planes in decoder
-
- lf_data->lf_sync = lf_sync;
- lf_data->num_lf_workers = num_workers;
-
- // Start loopfiltering
- if (i == num_workers - 1) {
- winterface->execute(worker);
- } else {
- winterface->launch(worker);
+ vp9_frameworker_lock_stats(ref_worker);
+ while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
+ ref_buf->buf.corrupted != 1) {
+ pthread_cond_wait(&ref_worker_data->stats_cond,
+ &ref_worker_data->stats_mutex);
}
- }
- // Wait till all rows are finished
- for (i = 0; i < num_workers; ++i) {
- winterface->sync(&pbi->tile_workers[i]);
+ if (ref_buf->buf.corrupted == 1) {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ vp9_frameworker_unlock_stats(ref_worker);
+ vpx_internal_error(&worker_data->pbi->common.error,
+ VPX_CODEC_CORRUPT_FRAME,
+ "Worker %p failed to decode frame", worker);
+ }
+ vp9_frameworker_unlock_stats(ref_worker);
}
+#else
+ (void)worker;
+  (void)ref_buf;
+  (void)row;
+#endif // CONFIG_MULTITHREAD
}
-// Set up nsync by width.
-static int get_sync_range(int width) {
- // nsync numbers are picked by testing. For example, for 4k
- // video, using 4 gives best performance.
- if (width < 640)
- return 1;
- else if (width <= 1280)
- return 2;
- else if (width <= 4096)
- return 4;
- else
- return 8;
-}
-
-// Allocate memory for lf row synchronization
-void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
- int width) {
- lf_sync->rows = rows;
+void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
#if CONFIG_MULTITHREAD
- {
- int i;
-
- CHECK_MEM_ERROR(cm, lf_sync->mutex_,
- vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
- if (lf_sync->mutex_) {
- for (i = 0; i < rows; ++i) {
- pthread_mutex_init(&lf_sync->mutex_[i], NULL);
- }
- }
+ VP9Worker *worker = buf->frame_worker_owner;
- CHECK_MEM_ERROR(cm, lf_sync->cond_,
- vpx_malloc(sizeof(*lf_sync->cond_) * rows));
- if (lf_sync->cond_) {
- for (i = 0; i < rows; ++i) {
- pthread_cond_init(&lf_sync->cond_[i], NULL);
- }
- }
+#ifdef DEBUG_THREAD
+ {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
+ buf->frame_worker_owner, row);
}
-#endif // CONFIG_MULTITHREAD
+#endif
- CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
- vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
-
- // Set up nsync.
- lf_sync->sync_range = get_sync_range(width);
+ vp9_frameworker_lock_stats(worker);
+ buf->row = row;
+ vp9_frameworker_signal_stats(worker);
+ vp9_frameworker_unlock_stats(worker);
+#else
+ (void)buf;
+ (void)row;
+#endif // CONFIG_MULTITHREAD
}
-// Deallocate lf synchronization related mutex and data
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
- if (lf_sync != NULL) {
+void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
+ VP9Worker *const src_worker) {
#if CONFIG_MULTITHREAD
- int i;
+ FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
+ FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
+ VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
+ VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
+ int i;
- if (lf_sync->mutex_ != NULL) {
- for (i = 0; i < lf_sync->rows; ++i) {
- pthread_mutex_destroy(&lf_sync->mutex_[i]);
- }
- vpx_free(lf_sync->mutex_);
- }
- if (lf_sync->cond_ != NULL) {
- for (i = 0; i < lf_sync->rows; ++i) {
- pthread_cond_destroy(&lf_sync->cond_[i]);
- }
- vpx_free(lf_sync->cond_);
- }
-#endif // CONFIG_MULTITHREAD
- vpx_free(lf_sync->cur_sb_col);
- // clear the structure as the source of this call may be a resize in which
- // case this call will be followed by an _alloc() which may fail.
- vp9_zero(*lf_sync);
+ // Wait until source frame's context is ready.
+ vp9_frameworker_lock_stats(src_worker);
+ while (!src_worker_data->frame_context_ready) {
+ pthread_cond_wait(&src_worker_data->stats_cond,
+ &src_worker_data->stats_mutex);
}
+
+ dst_cm->last_frame_seg_map = src_cm->seg.enabled ?
+ src_cm->current_frame_seg_map : src_cm->last_frame_seg_map;
+ dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
+ vp9_frameworker_unlock_stats(src_worker);
+
+ dst_cm->bit_depth = src_cm->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+ dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+#endif
+ dst_cm->prev_frame = src_cm->show_existing_frame ?
+ src_cm->prev_frame : src_cm->cur_frame;
+ dst_cm->last_width = !src_cm->show_existing_frame ?
+ src_cm->width : src_cm->last_width;
+ dst_cm->last_height = !src_cm->show_existing_frame ?
+ src_cm->height : src_cm->last_height;
+ dst_cm->subsampling_x = src_cm->subsampling_x;
+ dst_cm->subsampling_y = src_cm->subsampling_y;
+ dst_cm->frame_type = src_cm->frame_type;
+ dst_cm->last_show_frame = !src_cm->show_existing_frame ?
+ src_cm->show_frame : src_cm->last_show_frame;
+ for (i = 0; i < REF_FRAMES; ++i)
+ dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
+
+ memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
+ (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
+ dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
+ dst_cm->lf.filter_level = src_cm->lf.filter_level;
+ memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS);
+ memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+ dst_cm->seg = src_cm->seg;
+ memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
+ FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
+#else
+ (void) dst_worker;
+ (void) src_worker;
+#endif // CONFIG_MULTITHREAD
}
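
The rewritten vp9_dthread.c above replaces the row-based loop-filter synchronization with a frame-level progress handshake: a consumer blocks on the reference frame's condition variable until decoding has passed the row it needs, and the producer periodically publishes its progress and broadcasts. A minimal pthreads sketch of that protocol, with RefProgress as a simplified stand-in for RefCntBuffer/FrameWorkerData:

    #include <pthread.h>

    typedef struct {
      pthread_mutex_t mutex;
      pthread_cond_t cond;
      int row;        /* last fully decoded row; -1 before decoding starts */
      int corrupted;
    } RefProgress;

    /* Consumer side, cf. vp9_frameworker_wait(). */
    static void wait_for_row(RefProgress *p, int row) {
      pthread_mutex_lock(&p->mutex);
      while (p->row < row && !p->corrupted)
        pthread_cond_wait(&p->cond, &p->mutex);
      pthread_mutex_unlock(&p->mutex);
    }

    /* Producer side, cf. vp9_frameworker_broadcast(). */
    static void publish_row(RefProgress *p, int row) {
      pthread_mutex_lock(&p->mutex);
      p->row = row;
      pthread_cond_broadcast(&p->cond);  /* wake every waiting worker */
      pthread_mutex_unlock(&p->mutex);
    }

The unsynchronized early-out in the real code is an optimization layered on top of this: if the published row is already far enough along, the consumer skips the lock entirely.
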
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h
index b1fbdeb74a0..979cb3d8bd6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h
@@ -13,46 +13,54 @@
#include "./vpx_config.h"
#include "vp9/common/vp9_thread.h"
-#include "vp9/decoder/vp9_reader.h"
+#include "vpx/internal/vpx_codec_internal.h"
struct VP9Common;
struct VP9Decoder;
-typedef struct TileWorkerData {
- struct VP9Common *cm;
- vp9_reader bit_reader;
- DECLARE_ALIGNED(16, struct macroblockd, xd);
+// WorkerData for the FrameWorker thread. It contains all the worker state
+// and the decode structures needed to decode a frame.
+typedef struct FrameWorkerData {
+ struct VP9Decoder *pbi;
+ const uint8_t *data;
+ const uint8_t *data_end;
+ size_t data_size;
+ void *user_priv;
+ int result;
+ int worker_id;
+ int received_frame;
- // Row-based parallel loopfilter data
- LFWorkerData lfdata;
-} TileWorkerData;
+  // scratch_buffer is used in frame-parallel mode only; it holds a copy of
+  // the compressed data.
+ uint8_t *scratch_buffer;
+ size_t scratch_buffer_size;
-// Loopfilter row synchronization
-typedef struct VP9LfSyncData {
#if CONFIG_MULTITHREAD
- pthread_mutex_t *mutex_;
- pthread_cond_t *cond_;
+ pthread_mutex_t stats_mutex;
+ pthread_cond_t stats_cond;
#endif
- // Allocate memory to store the loop-filtered superblock index in each row.
- int *cur_sb_col;
- // The optimal sync_range for different resolution and platform should be
- // determined by testing. Currently, it is chosen to be a power-of-2 number.
- int sync_range;
- int rows;
-} VP9LfSync;
-
-// Allocate memory for loopfilter row synchronization.
-void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
- int width);
-
-// Deallocate loopfilter synchronization related mutex and data.
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
-
-// Multi-threaded loopfilter that uses the tile threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
- struct VP9Decoder *pbi,
- struct VP9Common *cm,
- int frame_filter_level,
- int y_only);
+
+ int frame_context_ready; // Current frame's context is ready to read.
+ int frame_decoded; // Finished decoding current frame.
+} FrameWorkerData;
+
+void vp9_frameworker_lock_stats(VP9Worker *const worker);
+void vp9_frameworker_unlock_stats(VP9Worker *const worker);
+void vp9_frameworker_signal_stats(VP9Worker *const worker);
+
+// Wait until ref_buf has been decoded up to row, in real pixel units.
+// Note: the worker may already have finished decoding ref_buf and released it
+// in order to start decoding the next frame, so callers must check whether the
+// worker is still decoding ref_buf.
+void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
+ int row);
+
+// FrameWorker broadcasts its decoding progress so other workers that are
+// waiting on it can resume decoding.
+void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
+
+// Copy necessary decoding context from src worker to dst worker.
+void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
+ VP9Worker *const src_worker);
#endif // VP9_DECODER_VP9_DTHREAD_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_read_bit_buffer.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_read_bit_buffer.c
index 3eef72844c1..c3b38a9c710 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_read_bit_buffer.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_read_bit_buffer.c
@@ -10,20 +10,20 @@
#include "vp9/decoder/vp9_read_bit_buffer.h"
size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
- return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT;
+ return (rb->bit_offset + 7) >> 3;
}
int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
const size_t off = rb->bit_offset;
- const size_t p = off / CHAR_BIT;
- const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
- if (rb->bit_buffer + p >= rb->bit_buffer_end) {
- rb->error_handler(rb->error_handler_data);
- return 0;
- } else {
- const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
+ const size_t p = off >> 3;
+ const int q = 7 - (int)(off & 0x7);
+ if (rb->bit_buffer + p < rb->bit_buffer_end) {
+ const int bit = (rb->bit_buffer[p] >> q) & 1;
rb->bit_offset = off + 1;
return bit;
+ } else {
+ rb->error_handler(rb->error_handler_data);
+ return 0;
}
}
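
The rewrite above trades CHAR_BIT divisions for shifts and masks, relying on two identities that hold for 8-bit bytes: (off + 7) >> 3 is the number of bytes needed to hold off bits, and (buf[off >> 3] >> (7 - (off & 7))) & 1 extracts bit off in MSB-first order. A quick self-check, assuming CHAR_BIT == 8 as the new code does:

    #include <assert.h>
    #include <stddef.h>

    static int get_bit(const unsigned char *buf, size_t off) {
      return (buf[off >> 3] >> (7 - (int)(off & 7))) & 1;
    }

    int main(void) {
      const unsigned char b[1] = { 0xA5 };          /* 1010 0101 */
      assert(get_bit(b, 0) == 1 && get_bit(b, 1) == 0);
      assert(get_bit(b, 6) == 0 && get_bit(b, 7) == 1);
      assert((((size_t)9) + 7) >> 3 == 2);          /* 9 bits -> 2 bytes */
      return 0;
    }
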
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_reader.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_reader.h
index 2d9eccfbf93..a68a1d5925b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_reader.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_reader.h
@@ -30,14 +30,15 @@ typedef size_t BD_VALUE;
#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
typedef struct {
- const uint8_t *buffer_end;
- const uint8_t *buffer;
- uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
+  // Be careful when reordering this struct; it may impact cache performance.
BD_VALUE value;
- int count;
unsigned int range;
+ int count;
+ const uint8_t *buffer_end;
+ const uint8_t *buffer;
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
+ uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
} vp9_reader;
int vp9_reader_init(vp9_reader *r,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c
new file mode 100644
index 00000000000..f505fcb7ac0
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
+ const uint32x4_t a = vpaddlq_u16(v_16x8);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
+unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
+ uint8x8_t v_s0 = vld1_u8(s);
+ const uint8x8_t v_s1 = vld1_u8(s + p);
+ uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
+
+ v_s0 = vld1_u8(s + 2 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 3 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 4 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 5 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 6 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 7 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ return (horizontal_add_u16x8(v_sum) + 32) >> 6;
+}
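
The NEON routine above averages an 8x8 block with widening pairwise adds; its scalar meaning is simply "sum 64 pixels, then divide by 64 with rounding", where (sum + 32) >> 6 is the rounded divide. A scalar reference one might use to cross-check the NEON path (an assumed test helper, not part of the patch):

    #include <stdint.h>

    static unsigned int avg_8x8_c(const uint8_t *s, int p) {
      unsigned int sum = 0;
      int r, c;
      for (r = 0; r < 8; ++r)
        for (c = 0; c < 8; ++c)
          sum += s[r * p + c];     /* p is the row stride */
      return (sum + 32) >> 6;      /* round-to-nearest divide by 64 */
    }
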
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
index 6c66f5d5bc9..a6d4797adae 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -32,6 +32,24 @@ void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
}
}
+void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
+ int16_t* coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t* zbin_ptr,
+ const int16_t* round_ptr, const int16_t* quant_ptr,
+ const int16_t* quant_shift_ptr,
+ int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr,
+ const int16_t* dequant_ptr, uint16_t* eob_ptr,
+ const int16_t* scan_ptr,
+ const int16_t* iscan_ptr) {
+ int16_t temp_buffer[64];
+ (void)coeff_ptr;
+
+ vp9_fdct8x8_neon(input, temp_buffer, stride);
+ vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
+}
+
void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
int i;
// stage 1
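
The new vp9_fdct8x8_quant_neon() above fuses the forward transform and fast quantization by chaining the two existing NEON kernels through a 64-entry stack buffer, so the coefficients never round-trip through the caller's coefficient array. The pattern in the abstract, with function pointers as hypothetical stand-ins for the vp9_fdct8x8_neon / vp9_quantize_fp_neon pair:

    /* transform() and quantize() stand in for the chained NEON kernels. */
    static void fdct_quant_fused(const short *input, int stride, int n_coeffs,
                                 void (*transform)(const short *, short *, int),
                                 void (*quantize)(const short *, int)) {
      short temp[64];                  /* one 8x8 block, kept on the stack */
      transform(input, temp, stride);  /* forward DCT into temp */
      quantize(temp, n_coeffs);        /* consume coefficients while warm */
    }
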
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 8c13d0da672..47363c75ba5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -26,13 +26,12 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
   // TODO(jingning) Decide whether these arguments are needed once the
   // quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)scan;
if (!skip_block) {
@@ -112,8 +111,8 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
*eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
}
} else {
- vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+ memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
*eob_ptr = 0;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c
index 816fbda1fbe..cf82dd75d92 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -10,6 +10,7 @@
#include <arm_neon.h>
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
@@ -19,18 +20,6 @@
#include "vp9/encoder/vp9_variance.h"
-enum { kWidth8 = 8 };
-enum { kHeight8 = 8 };
-enum { kHeight8PlusOne = 9 };
-enum { kWidth16 = 16 };
-enum { kHeight16 = 16 };
-enum { kHeight16PlusOne = 17 };
-enum { kWidth32 = 32 };
-enum { kHeight32 = 32 };
-enum { kHeight32PlusOne = 33 };
-enum { kPixelStepOne = 1 };
-enum { kAlign16 = 16 };
-
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
const int32x4_t a = vpaddlq_s16(v_16x8);
const int64x2_t b = vpaddlq_s32(a);
@@ -46,9 +35,10 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
return vget_lane_s32(c, 0);
}
+// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
int i, j;
int16x8_t v_sum = vdupq_n_s16(0);
int32x4_t v_sse_lo = vdupq_n_s32(0);
@@ -79,31 +69,31 @@ static void variance_neon_w8(const uint8_t *a, int a_stride,
void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8,
- kHeight8, sse, sum);
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,
+ 8, sse, sum);
}
unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
- return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 6);  // >> 6 = / (8 * 8)
}
void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,
- kHeight16, sse, sum);
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,
+ 16, sse, sum);
}
unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
- return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16));
+ variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 8);  // >> 8 = / (16 * 16)
}
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@@ -164,15 +154,15 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
+ DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
- var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
- kHeight8PlusOne, kWidth8,
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
+ 9, 8,
BILINEAR_FILTERS_2TAP(xoffset));
- var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
- kWidth8, BILINEAR_FILTERS_2TAP(yoffset));
- return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
+ var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
+ 8, BILINEAR_FILTERS_2TAP(yoffset));
+ return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
@@ -182,30 +172,85 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16);
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);
+ DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
- var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
- kHeight16PlusOne, kWidth16,
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 17, 16,
BILINEAR_FILTERS_2TAP(xoffset));
- var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,
- kWidth16, BILINEAR_FILTERS_2TAP(yoffset));
- return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);
+ var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
+ 16, BILINEAR_FILTERS_2TAP(yoffset));
+ return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32,
- kHeight32, sse, sum);
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,
+ 32, sse, sum);
}
unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum);
- return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32));
+ variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / (32 * 32)
+}
+
+unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+ variance_neon_w8(a + (32 * a_stride), a_stride,
+ b + (32 * b_stride), b_stride, 32, 32,
+ &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (32 * 64)
+}
+
+unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w8(a + (16 * a_stride), a_stride,
+ b + (16 * b_stride), b_stride, 64, 16,
+ &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (64 * 32)
+}
+
+unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+
+ variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w8(a + (16 * a_stride), a_stride,
+ b + (16 * b_stride), b_stride, 64, 16,
+ &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
+ b + (16 * 2 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
+ b + (16 * 3 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / (64 * 64)
}
unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
@@ -215,13 +260,31 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32);
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32);
+ DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 33, 32,
+ BILINEAR_FILTERS_2TAP(xoffset));
+ var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
+ 32, BILINEAR_FILTERS_2TAP(yoffset));
+ return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+}
+
+unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
- var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
- kHeight32PlusOne, kWidth32,
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 65, 64,
BILINEAR_FILTERS_2TAP(xoffset));
- var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32,
- kWidth32, BILINEAR_FILTERS_2TAP(yoffset));
- return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse);
+ var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
+ 64, BILINEAR_FILTERS_2TAP(yoffset));
+ return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}
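
Two threads run through the variance changes above: the divide in var = sse - sum^2 / (w * h) becomes a shift because every block size here has a power-of-two area, and the 64-pixel-wide sizes are accumulated in 64x16 slices so the 16-bit row accumulator respects the w * h < 2048 limit noted earlier. A scalar model of the computation (shift = log2(w * h); illustrative, not the libvpx C reference):

    #include <stdint.h>

    static unsigned int variance_c(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   int w, int h, int shift) {
      int64_t sum = 0;
      uint32_t sse = 0;
      int i, j;
      for (i = 0; i < h; ++i)
        for (j = 0; j < w; ++j) {
          const int d = a[i * a_stride + j] - b[i * b_stride + j];
          sum += d;                    /* signed sum of differences */
          sse += (uint32_t)(d * d);    /* sum of squared differences */
        }
      /* e.g. shift = 12 for 64x64, matching ">> 12 = / (64 * 64)" above. */
      return sse - (unsigned int)((sum * sum) >> shift);
    }
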
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c
index f7fca0cde0a..9622ba1d67e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c
@@ -11,22 +11,34 @@
#include <limits.h>
#include <math.h>
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/common/vp9_seg_common.h"
-
#include "vp9/encoder/vp9_segmentation.h"
-#define AQ_C_SEGMENTS 3
-#define AQ_C_STRENGTHS 3
-static const int aq_c_active_segments[AQ_C_STRENGTHS] = {1, 2, 3};
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+#define AQ_C_STRENGTHS 3
static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
- {{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}};
+ { {1.75, 1.25, 1.05, 1.00, 0.90},
+ {2.00, 1.50, 1.15, 1.00, 0.85},
+ {2.50, 1.75, 1.25, 1.00, 0.80} };
static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
- {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}};
+ { {0.15, 0.30, 0.55, 2.00, 100.0},
+ {0.20, 0.40, 0.65, 2.00, 100.0},
+ {0.25, 0.50, 0.75, 2.00, 100.0} };
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+ { {-4.0, -3.0, -2.0, 100.00, 100.0},
+ {-3.5, -2.5, -1.5, 100.00, 100.0},
+ {-3.0, -2.0, -1.0, 100.00, 100.0} };
+
+#define DEFAULT_COMPLEXITY 64
+
static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) {
// Approximate base quantizer (truncated to int)
const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4;
- return (base_quant > 20) + (base_quant > 45);
+ return (base_quant > 10) + (base_quant > 25);
}
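A worked example with an illustrative quantizer value: if vp9_ac_quant() returns 88, then base_quant = 88 / 4 = 22, so the strength is (22 > 10) + (22 > 25) = 1, selecting the middle rows of the tables above. The new 10/25 breakpoints move frames into the stronger settings at a much lower Q than the old 20/45 pair.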
void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
@@ -41,13 +53,9 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
int segment;
const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
- const int active_segments = aq_c_active_segments[aq_strength];
// Clear down the segment map.
- vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
-
- // Clear down the complexity map used for rd.
- vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
+ memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
vp9_clearall_segfeatures(seg);
@@ -63,15 +71,21 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
// Select delta coding method.
seg->abs_delta = SEGMENT_DELTADATA;
- // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
- vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+ // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+ vp9_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
// Use some of the segments for in frame Q adjustment.
- for (segment = 1; segment < active_segments; ++segment) {
- int qindex_delta =
- vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
- aq_c_q_adj_factor[aq_strength][segment],
- cm->bit_depth);
+ for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+ int qindex_delta;
+
+ if (segment == DEFAULT_AQ2_SEG)
+ continue;
+
+ qindex_delta =
+ vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment],
+ cm->bit_depth);
+
// For AQ complexity mode, we don't allow Q0 in a segment if the base
// Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
@@ -88,55 +102,54 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
}
}
-// Select a segment for the current SB64 block.
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+#define VAR_STRENGTH_STEP 0.25
+// Select a segment for the current block.
// The choice of segment for a block depends on the ratio of the projected
-// bits for the block vs a target average.
-// An "aq_strength" value determines how many segments are supported,
-// the set of transition points to use and the extent of the quantizer
-// adjustment for each segment (configured in vp9_setup_in_frame_q_adj()).
-void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
- int mi_row, int mi_col,
- int output_enabled, int projected_rate) {
+// bits for the block vs a target average and its spatial complexity.
+void vp9_caq_select_segment(VP9_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate) {
VP9_COMMON *const cm = &cpi->common;
const int mi_offset = mi_row * cm->mi_cols + mi_col;
const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
- const int xmis = MIN(cm->mi_cols - mi_col, bw);
- const int ymis = MIN(cm->mi_rows - mi_row, bh);
- int complexity_metric = 64;
+ const int xmis = MIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
+ const int ymis = MIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
int x, y;
-
+ int i;
unsigned char segment;
- if (!output_enabled) {
- segment = 0;
+ if (0) {
+ segment = DEFAULT_AQ2_SEG;
} else {
// Rate depends on fraction of a SB64 in frame ((xmis * ymis) / (bw * bh)).
// It is converted to bits * 256 units.
const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
(bw * bh);
+ double logvar;
+ double low_var_thresh;
const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
- const int active_segments = aq_c_active_segments[aq_strength];
-
- // The number of segments considered and the transition points used to
- // select them is determined by the "aq_strength" value.
- // Currently this loop only supports segments that reduce Q (i.e. where
- // there is undershoot.
- // The loop counts down towards segment 0 which is the default segment
- // with no Q adjustment.
- segment = active_segments - 1;
- while (segment > 0) {
- if (projected_rate <
- (target_rate * aq_c_transitions[aq_strength][segment])) {
+
+ vp9_clear_system_state();
+ low_var_thresh = (cpi->oxcf.pass == 2)
+ ? MAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
+
+ vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+ logvar = vp9_log_block_var(cpi, mb, bs);
+
+ segment = AQ_C_SEGMENTS - 1; // In case there is no break out below.
+ for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+ // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q.
+ if ((projected_rate <
+ target_rate * aq_c_transitions[aq_strength][i]) &&
+ (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+ segment = i;
break;
}
- --segment;
- }
-
- if (target_rate > 0) {
- complexity_metric =
- clamp((int)((projected_rate * 64) / target_rate), 16, 255);
}
}
@@ -144,8 +157,6 @@ void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
for (y = 0; y < ymis; y++) {
for (x = 0; x < xmis; x++) {
cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
- cpi->complexity_map[mi_offset + y * cm->mi_cols + x] =
- (unsigned char)complexity_metric;
}
}
}
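A worked pass through the selection loop above, with illustrative numbers: at aq_strength 1 with low_var_thresh 10.0, segment 0 requires projected_rate < 0.20 * target_rate and logvar < 6.5, segment 1 requires rate < 0.40 * target_rate and logvar < 7.5, and so on up the rows. A block at 30% of the target rate with logvar 7.0 therefore lands in segment 1 (Q adjustment factor 1.50), while a block at twice the target rate or more falls through to segment 4 (factor 0.85, i.e. a higher Q).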
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h
index af031a46c6c..c0dce6c5b7d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h
@@ -17,11 +17,12 @@ extern "C" {
#endif
struct VP9_COMP;
+struct macroblock;
-// Select a segment for the current SB64.
-void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, int mi_row, int mi_col,
- int output_enabled, int projected_rate);
-
+// Select a segment for the current Block.
+void vp9_caq_select_segment(struct VP9_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate);
// This function sets up a set of segments with delta Q values around
// the baseline frame quantizer.
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index 514ff7a52ad..24b427df575 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -19,31 +19,38 @@
#include "vp9/encoder/vp9_segmentation.h"
struct CYCLIC_REFRESH {
- // Percentage of super-blocks per frame that are targeted as candidates
+ // Percentage of blocks per frame that are targeted as candidates
// for cyclic refresh.
- int max_sbs_perframe;
+ int percent_refresh;
// Maximum q-delta as percentage of base q.
int max_qdelta_perc;
- // Block size below which we don't apply cyclic refresh.
- BLOCK_SIZE min_block_size;
// Superblock starting index for cycling through the frame.
int sb_index;
- // Controls how long a block will need to wait to be refreshed again.
+ // Controls how long a block will need to wait to be refreshed again, in
+ // excess of the cycle time, i.e., in the case of all-zero motion a block
+ // will be refreshed every (100 / percent_refresh + time_for_refresh) frames.
int time_for_refresh;
- // Actual number of (8x8) blocks that were applied delta-q (segment 1).
- int num_seg_blocks;
- // Actual encoding bits for segment 1.
- int actual_seg_bits;
+ // Target number of (8x8) blocks that are set for delta-q.
+ int target_num_seg_blocks;
+ // Actual number of (8x8) blocks that were applied delta-q.
+ int actual_num_seg1_blocks;
+ int actual_num_seg2_blocks;
// RD mult. parameters for segment 1.
int rdmult;
// Cyclic refresh map.
signed char *map;
- // Projected rate and distortion for the current superblock.
- int64_t projected_rate_sb;
- int64_t projected_dist_sb;
- // Thresholds applied to projected rate/distortion of the superblock.
+ // Thresholds applied to the projected rate/distortion of the coding block,
+ // when deciding whether a block should be refreshed.
int64_t thresh_rate_sb;
int64_t thresh_dist_sb;
+ // Threshold applied to the motion vector (in units of 1/8 pel) of the
+ // coding block, when deciding whether a block should be refreshed.
+ int16_t motion_thresh;
+ // Rate target ratio to set q delta.
+ double rate_ratio_qdelta;
+ double low_content_avg;
+ int qindex_delta_seg1;
+ int qindex_delta_seg2;
};
CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
@@ -73,10 +80,10 @@ static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm,
// with number of seg blocks, so compare available bits to number of blocks.
// Average bits available per frame = avg_frame_bandwidth
// Number of (8x8) blocks in frame = mi_rows * mi_cols;
- const float factor = 0.5;
+ const float factor = 0.25;
const int number_blocks = cm->mi_rows * cm->mi_cols;
// The condition below corresponds to turning off at target bitrates:
- // ~24kbps for CIF, 72kbps for VGA (at 30fps).
+ // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kbps for HD/720p.
// Also turn off at very small frame sizes, to avoid too large fraction of
// superblocks to be refreshed per frame. Threshold below is less than QCIF.
if (rc->avg_frame_bandwidth < factor * number_blocks ||
@@ -92,33 +99,98 @@ static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm,
// mode, and rate/distortion.
static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
const MB_MODE_INFO *mbmi,
- BLOCK_SIZE bsize, int use_rd) {
- if (use_rd) {
- // If projected rate is below the thresh_rate (well below target,
- // so undershoot expected), accept it for lower-qp coding.
- if (cr->projected_rate_sb < cr->thresh_rate_sb)
- return 1;
- // Otherwise, reject the block for lower-qp coding if any of the following:
- // 1) prediction block size is below min_block_size
- // 2) mode is non-zero mv and projected distortion is above thresh_dist
- // 3) mode is an intra-mode (we may want to allow some of this under
- // another thresh_dist)
- else if (bsize < cr->min_block_size ||
- (mbmi->mv[0].as_int != 0 &&
- cr->projected_dist_sb > cr->thresh_dist_sb) ||
- !is_inter_block(mbmi))
- return 0;
- else
- return 1;
- } else {
- // Rate/distortion not used for update.
- if (bsize < cr->min_block_size ||
- mbmi->mv[0].as_int != 0 ||
- !is_inter_block(mbmi))
- return 0;
- else
- return 1;
+ int64_t rate,
+ int64_t dist,
+ int bsize) {
+ MV mv = mbmi->mv[0].as_mv;
+ // Reject the block for lower-qp coding if projected distortion
+ // is above the threshold, and any of the following is true:
+ // 1) mode uses large mv
+ // 2) mode is an intra-mode
+ // Otherwise accept for refresh.
+ if (dist > cr->thresh_dist_sb &&
+ (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+ mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+ !is_inter_block(mbmi)))
+ return CR_SEGMENT_ID_BASE;
+ else if (bsize >= BLOCK_16X16 &&
+ rate < cr->thresh_rate_sb &&
+ is_inter_block(mbmi) &&
+ mbmi->mv[0].as_int == 0)
+ // More aggressive delta-q for bigger blocks with zero motion.
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute delta-q for the segment.
+static int compute_deltaq(const VP9_COMP *cpi, int q, double rate_factor) {
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int deltaq = vp9_compute_qdelta_by_rate(rc, cpi->common.frame_type,
+ q, rate_factor,
+ cpi->common.bit_depth);
+ if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
+ deltaq = -cr->max_qdelta_perc * q / 100;
}
+ return deltaq;
+}
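A worked example of the clamp, with illustrative values: with max_qdelta_perc = 50 (set in vp9_cyclic_refresh_setup() below) and q = 60, a computed deltaq of -40 exceeds the 50% cap of -30 and is clamped to -30, bounding how far the boosted segments can drop below the base quantizer.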
+
+// For the just encoded frame, estimate the bits, incorporating the delta-q
+// from the non-base segments. For now, ignore the effect of multiple segments
+// (with different delta-q). Note this function is called in the postencode
+// stage (from rc_update_rate_correction_factors()).
+int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi,
+ double correction_factor) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int estimated_bits;
+ int mbs = cm->MBs;
+ int num8x8bl = mbs << 2;
+ // Weight for non-base segments: use actual number of blocks refreshed in
+ // previous/just encoded frame. Note number of blocks here is in 8x8 units.
+ double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
+ double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+ // Take segment weighted average for estimated bits.
+ estimated_bits = (int)((1.0 - weight_segment1 - weight_segment2) *
+ vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+ correction_factor, cm->bit_depth) +
+ weight_segment1 *
+ vp9_estimate_bits_at_q(cm->frame_type,
+ cm->base_qindex + cr->qindex_delta_seg1, mbs,
+ correction_factor, cm->bit_depth) +
+ weight_segment2 *
+ vp9_estimate_bits_at_q(cm->frame_type,
+ cm->base_qindex + cr->qindex_delta_seg2, mbs,
+ correction_factor, cm->bit_depth));
+ return estimated_bits;
+}
+
+// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+// a corresponding delta-q (for segment 1). This function is called from
+// rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
+int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
+ double correction_factor) {
+ const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int bits_per_mb;
+ int num8x8bl = cm->MBs << 2;
+ // Weight for segment prior to encoding: take the average of the target
+ // number for the frame to be encoded and the actual from the previous frame.
+ double weight_segment = (double)((cr->target_num_seg_blocks +
+ cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >> 1) /
+ num8x8bl;
+ // Compute delta-q corresponding to qindex i.
+ int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ // Take segment weighted average for bits per mb.
+ bits_per_mb = (int)((1.0 - weight_segment) *
+ vp9_rc_bits_per_mb(cm->frame_type, i, correction_factor, cm->bit_depth) +
+ weight_segment *
+ vp9_rc_bits_per_mb(cm->frame_type, i + deltaq, correction_factor,
+ cm->bit_depth));
+ return bits_per_mb;
}
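Both estimators above are the same segment-weighted average, B = (1 - w) * R(q) + w * R(q + deltaq). A worked example with illustrative numbers: for a 640x360 frame, MBs = 40 * 23 = 920 and num8x8bl = 3680; if the target and actual refreshed block counts are both 368 (about 10%), then w = ((368 + 368 + 0) >> 1) / 3680 = 0.1, so bits_per_mb = 0.9 * R(q) + 0.1 * R(q + deltaq).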
// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
@@ -127,7 +199,10 @@ static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
MB_MODE_INFO *const mbmi,
int mi_row, int mi_col,
- BLOCK_SIZE bsize, int use_rd) {
+ BLOCK_SIZE bsize,
+ int64_t rate,
+ int64_t dist,
+ int skip) {
const VP9_COMMON *const cm = &cpi->common;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
const int bw = num_8x8_blocks_wide_lookup[bsize];
@@ -135,21 +210,26 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
const int block_index = mi_row * cm->mi_cols + mi_col;
- const int refresh_this_block = cpi->mb.in_static_area ||
- candidate_refresh_aq(cr, mbmi, bsize, use_rd);
+ const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
+ bsize);
// Default is to not update the refresh map.
int new_map_value = cr->map[block_index];
int x = 0; int y = 0;
- // Check if we should reset the segment_id for this block.
- if (mbmi->segment_id > 0 && !refresh_this_block)
- mbmi->segment_id = 0;
+ // If this block is labeled for refresh, check if we should reset the
+ // segment_id.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ mbmi->segment_id = refresh_this_block;
+ // Reset segment_id if the block will be skipped.
+ if (skip)
+ mbmi->segment_id = CR_SEGMENT_ID_BASE;
+ }
// Update the cyclic refresh map, to be used for setting segmentation map
// for the next frame. If the block will be refreshed this frame, mark it
// as clean. The magnitude of the -ve influences how long before we consider
// it for refresh again.
- if (mbmi->segment_id == 1) {
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
new_map_value = -cr->time_for_refresh;
} else if (refresh_this_block) {
// Else if it is accepted as candidate for refresh, and has not already
@@ -161,6 +241,7 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
// Leave it marked as block that is not candidate for refresh.
new_map_value = 1;
}
+
// Update entries in the cyclic refresh map with new_map_value, and
// copy mbmi->segment_id into global segmentation map.
for (y = 0; y < ymis; y++)
@@ -169,10 +250,188 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
cpi->segmentation_map[block_index + y * cm->mi_cols + x] =
mbmi->segment_id;
}
- // Keep track of actual number (in units of 8x8) of blocks in segment 1 used
- // for encoding this frame.
- if (mbmi->segment_id)
- cr->num_seg_blocks += xmis * ymis;
+}
+
+// Update the actual number of blocks to which the segment delta-q was applied.
+void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int mi_row, mi_col;
+ cr->actual_num_seg1_blocks = 0;
+ cr->actual_num_seg2_blocks = 0;
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++)
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ if (cyclic_refresh_segment_id(
+ seg_map[mi_row * cm->mi_cols + mi_col]) == CR_SEGMENT_ID_BOOST1)
+ cr->actual_num_seg1_blocks++;
+ else if (cyclic_refresh_segment_id(
+ seg_map[mi_row * cm->mi_cols + mi_col]) == CR_SEGMENT_ID_BOOST2)
+ cr->actual_num_seg2_blocks++;
+ }
+}
+
+// Set golden frame update interval, for non-svc 1 pass CBR mode.
+void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // Set minimum gf_interval for GF update to a multiple (== 4) of refresh
+ // period. Depending on past encoding stats, GF flag may be reset and update
+ // may not occur until next baseline_gf_interval.
+ if (cr->percent_refresh > 0)
+ rc->baseline_gf_interval = 4 * (100 / cr->percent_refresh);
+ else
+ rc->baseline_gf_interval = 40;
+}
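With the default percent_refresh of 10 (set in vp9_cyclic_refresh_update_parameters() below), this works out to 4 * (100 / 10) = 40 frames between scheduled golden-frame updates, i.e. four full refresh cycles.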
+
+// Update some encoding stats (from the just encoded frame). If this frame's
+// background has high motion, refresh the golden frame. Otherwise, if the
+// golden reference is to be updated, check whether we should NOT update the
+// golden ref.
+void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int mi_row, mi_col;
+ double fraction_low = 0.0;
+ int low_content_frame = 0;
+
+ MODE_INFO **mi = cm->mi_grid_visible;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int rows = cm->mi_rows, cols = cm->mi_cols;
+ int cnt1 = 0, cnt2 = 0;
+ int force_gf_refresh = 0;
+
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0 ?
+ mi[0]->mbmi.mv[0].as_mv.row : -1 * mi[0]->mbmi.mv[0].as_mv.row;
+ int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0 ?
+ mi[0]->mbmi.mv[0].as_mv.col : -1 * mi[0]->mbmi.mv[0].as_mv.col;
+
+ // Calculate the motion of the background.
+ if (abs_mvr <= 16 && abs_mvc <= 16) {
+ cnt1++;
+ if (abs_mvr == 0 && abs_mvc == 0)
+ cnt2++;
+ }
+ mi++;
+
+ // Accumulate low_content_frame.
+ if (cr->map[mi_row * cols + mi_col] < 1)
+ low_content_frame++;
+ }
+ mi += 8;
+ }
+
+ // For video conference clips, if the background has high motion in current
+ // frame because of the camera movement, set this frame as the golden frame.
+ // Use 70% and 5% as the thresholds for golden frame refreshing.
+ if (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1) {
+ vp9_cyclic_refresh_set_golden_update(cpi);
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ force_gf_refresh = 1;
+ }
+
+ fraction_low =
+ (double)low_content_frame / (rows * cols);
+ // Update average.
+ cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+ if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
+ // Don't update golden reference if the amount of low_content for the
+ // current encoded frame is small, or if the recursive average of the
+ // low_content over the update interval window falls below threshold.
+ if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
+ cpi->refresh_golden_frame = 0;
+ // Reset for the next interval.
+ cr->low_content_avg = fraction_low;
+ }
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+ int xmis, ymis, x, y;
+ memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
+ sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+ sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+ sbs_in_frame = sb_cols * sb_rows;
+ // Number of target blocks to get the q delta (segment 1).
+ block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+ // Set the segmentation map: cycle through the superblocks, starting at
+ // cr->sb_index, and stopping when either block_count blocks have been found
+ // to be refreshed, or we have passed through the whole frame.
+ assert(cr->sb_index < sbs_in_frame);
+ i = cr->sb_index;
+ cr->target_num_seg_blocks = 0;
+ do {
+ int sum_map = 0;
+ // Get the mi_row/mi_col corresponding to superblock index i.
+ int sb_row_index = (i / sb_cols);
+ int sb_col_index = i - sb_row_index * sb_cols;
+ int mi_row = sb_row_index * MI_BLOCK_SIZE;
+ int mi_col = sb_col_index * MI_BLOCK_SIZE;
+ assert(mi_row >= 0 && mi_row < cm->mi_rows);
+ assert(mi_col >= 0 && mi_col < cm->mi_cols);
+ bl_index = mi_row * cm->mi_cols + mi_col;
+ // Loop through all 8x8 blocks in superblock and update map.
+ xmis = MIN(cm->mi_cols - mi_col,
+ num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+ ymis = MIN(cm->mi_rows - mi_row,
+ num_8x8_blocks_high_lookup[BLOCK_64X64]);
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ const int bl_index2 = bl_index + y * cm->mi_cols + x;
+ // If the block is a candidate for cleanup, then mark it
+ // for possible boost/refresh (segment 1). The segment id may get
+ // reset to 0 later if the block gets coded as anything other than ZEROMV.
+ if (cr->map[bl_index2] == 0) {
+ sum_map++;
+ } else if (cr->map[bl_index2] < 0) {
+ cr->map[bl_index2]++;
+ }
+ }
+ }
+ // Enforce constant segment over superblock.
+ // If segment is at least half of superblock, set to 1.
+ if (sum_map >= xmis * ymis / 2) {
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+ }
+ cr->target_num_seg_blocks += xmis * ymis;
+ }
+ i++;
+ if (i == sbs_in_frame) {
+ i = 0;
+ }
+ } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+ cr->sb_index = i;
+}
+
+// Set/update global/frame level cyclic refresh parameters.
+void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ cr->percent_refresh = 10;
+ // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
+ // periods of the refresh cycle, after a key frame. This corresponds to ~40
+ // frames with cr->percent_refresh = 10.
+ if (rc->frames_since_key < 40)
+ cr->rate_ratio_qdelta = 3.0;
+ else
+ cr->rate_ratio_qdelta = 2.0;
}
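With these defaults, the rate factor later used for segment BOOST2 in vp9_cyclic_refresh_setup() works out to MIN(4.0, 1.7 * 3.0) = 4.0 for roughly the first 40 frames after a key frame and MIN(4.0, 1.7 * 2.0) = 3.4 afterwards (CR_MAX_RATE_TARGET_RATIO and CR_BOOST2_FAC from the header change below).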
// Setup cyclic background refresh: set delta q and segmentation map.
@@ -181,47 +440,38 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
struct segmentation *const seg = &cm->seg;
- unsigned char *const seg_map = cpi->segmentation_map;
const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
+ if (cm->current_video_frame == 0)
+ cr->low_content_avg = 0.0;
// Don't apply refresh on key frame or enhancement layer frames.
if (!apply_cyclic_refresh ||
(cm->frame_type == KEY_FRAME) ||
- (cpi->svc.temporal_layer_id > 0)) {
+ (cpi->svc.temporal_layer_id > 0) ||
+ (cpi->svc.spatial_layer_id > 0)) {
// Set segmentation map to 0 and disable.
- vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+ unsigned char *const seg_map = cpi->segmentation_map;
+ memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
vp9_disable_segmentation(&cm->seg);
if (cm->frame_type == KEY_FRAME)
cr->sb_index = 0;
return;
} else {
int qindex_delta = 0;
- int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
- int xmis, ymis, x, y, qindex2;
-
- // Rate target ratio to set q delta.
- const float rate_ratio_qdelta = 2.0;
+ int qindex2;
const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
vp9_clear_system_state();
- // Some of these parameters may be set via codec-control function later.
- cr->max_sbs_perframe = 10;
cr->max_qdelta_perc = 50;
- cr->min_block_size = BLOCK_8X8;
- cr->time_for_refresh = 1;
- // Set rate threshold to some fraction of target (and scaled by 256).
- cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 2;
+ cr->time_for_refresh = 0;
+ // Set rate threshold to some multiple (set to 2 for now) of the target
+ // rate (target is given by sb64_target_rate and scaled by 256).
+ cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
// Distortion threshold, quadratic in Q, scale factor to be adjusted.
- cr->thresh_dist_sb = 8 * (int)(q * q);
- if (cpi->sf.use_nonrd_pick_mode) {
- // May want to be more conservative with thresholds in non-rd mode for now
- // as rate/distortion are derived from model based on prediction residual.
- cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 3;
- cr->thresh_dist_sb = 4 * (int)(q * q);
- }
-
- cr->num_seg_blocks = 0;
+ // q will not exceed 457, so (q * q) is within 32bit; see:
+ // vp9_convert_qindex_to_q(), vp9_ac_quant(), ac_qlookup*[].
+ cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+ cr->motion_thresh = 32;
// Set up segmentation.
// Clear down the segment map.
- vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
vp9_enable_segmentation(&cm->seg);
vp9_clearall_segfeatures(seg);
// Select delta coding method.
@@ -234,89 +484,34 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
// relative to 0 previous map.
// seg->temporal_update = 0;
- // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
- vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
- // Use segment 1 for in-frame Q adjustment.
- vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
-
- // Set the q delta for segment 1.
- qindex_delta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
- cm->base_qindex,
- rate_ratio_qdelta,
- cm->bit_depth);
- // TODO(marpan): Incorporate the actual-vs-target rate over/undershoot from
- // previous encoded frame.
- if (-qindex_delta > cr->max_qdelta_perc * cm->base_qindex / 100)
- qindex_delta = -cr->max_qdelta_perc * cm->base_qindex / 100;
-
- // Compute rd-mult for segment 1.
+ // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+ vp9_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+ // Use segment BOOST1 for in-frame Q adjustment.
+ vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+ // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+ vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+ // Set the q delta for segment BOOST1.
+ qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+ cr->qindex_delta_seg1 = qindex_delta;
+
+ // Compute rd-mult for segment BOOST1.
qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+
cr->rdmult = vp9_compute_rd_mult(cpi, qindex2);
- vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qindex_delta);
-
- sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
- sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
- sbs_in_frame = sb_cols * sb_rows;
- // Number of target superblocks to get the q delta (segment 1).
- block_count = cr->max_sbs_perframe * sbs_in_frame / 100;
- // Set the segmentation map: cycle through the superblocks, starting at
- // cr->mb_index, and stopping when either block_count blocks have been found
- // to be refreshed, or we have passed through whole frame.
- assert(cr->sb_index < sbs_in_frame);
- i = cr->sb_index;
- do {
- int sum_map = 0;
- // Get the mi_row/mi_col corresponding to superblock index i.
- int sb_row_index = (i / sb_cols);
- int sb_col_index = i - sb_row_index * sb_cols;
- int mi_row = sb_row_index * MI_BLOCK_SIZE;
- int mi_col = sb_col_index * MI_BLOCK_SIZE;
- assert(mi_row >= 0 && mi_row < cm->mi_rows);
- assert(mi_col >= 0 && mi_col < cm->mi_cols);
- bl_index = mi_row * cm->mi_cols + mi_col;
- // Loop through all 8x8 blocks in superblock and update map.
- xmis = MIN(cm->mi_cols - mi_col,
- num_8x8_blocks_wide_lookup[BLOCK_64X64]);
- ymis = MIN(cm->mi_rows - mi_row,
- num_8x8_blocks_high_lookup[BLOCK_64X64]);
- for (y = 0; y < ymis; y++) {
- for (x = 0; x < xmis; x++) {
- const int bl_index2 = bl_index + y * cm->mi_cols + x;
- // If the block is as a candidate for clean up then mark it
- // for possible boost/refresh (segment 1). The segment id may get
- // reset to 0 later if block gets coded anything other than ZEROMV.
- if (cr->map[bl_index2] == 0) {
- seg_map[bl_index2] = 1;
- sum_map++;
- } else if (cr->map[bl_index2] < 0) {
- cr->map[bl_index2]++;
- }
- }
- }
- // Enforce constant segment over superblock.
- // If segment is partial over superblock, reset to either all 1 or 0.
- if (sum_map > 0 && sum_map < xmis * ymis) {
- const int new_value = (sum_map >= xmis * ymis / 2);
- for (y = 0; y < ymis; y++)
- for (x = 0; x < xmis; x++)
- seg_map[bl_index + y * cm->mi_cols + x] = new_value;
- }
- i++;
- if (i == sbs_in_frame) {
- i = 0;
- }
- if (sum_map >= xmis * ymis /2)
- block_count--;
- } while (block_count && i != cr->sb_index);
- cr->sb_index = i;
- }
-}
+ vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
-void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
- int64_t rate_sb, int64_t dist_sb) {
- cr->projected_rate_sb = rate_sb;
- cr->projected_dist_sb = dist_sb;
+ // Set a more aggressive (higher) q delta for segment BOOST2.
+ qindex_delta = compute_deltaq(cpi, cm->base_qindex,
+ MIN(CR_MAX_RATE_TARGET_RATIO,
+ CR_BOOST2_FAC * cr->rate_ratio_qdelta));
+ cr->qindex_delta_seg2 = qindex_delta;
+ vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Update the segmentation and refresh map.
+ vp9_cyclic_refresh_update_map(cpi);
+ }
}
int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
index f556d658bdc..21f114b5e59 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -18,6 +18,18 @@
extern "C" {
#endif
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+// Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+#define CR_BOOST2_FAC 1.7
+
struct VP9_COMP;
struct CYCLIC_REFRESH;
@@ -27,22 +39,59 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols);
void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+// Estimate the bits, incorporating the delta-q from segment 1, after encoding
+// the frame.
+int vp9_cyclic_refresh_estimate_bits_at_q(const struct VP9_COMP *cpi,
+ double correction_factor);
+
+// Estimate the bits per mb, for a given q = i and a corresponding delta-q
+// (for segment 1), prior to encoding the frame.
+int vp9_cyclic_refresh_rc_bits_per_mb(const struct VP9_COMP *cpi, int i,
+ double correction_factor);
+
// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
// check if we should reset the segment_id, and update the cyclic_refresh map
// and segmentation map.
void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
MB_MODE_INFO *const mbmi,
- int mi_row, int mi_col,
- BLOCK_SIZE bsize, int use_rd);
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip);
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+void vp9_cyclic_refresh_update_map(struct VP9_COMP *const cpi);
+
+// Update the actual number of blocks to which the segment delta-q was applied.
+void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi);
+
+// Set golden frame update interval, for non-svc 1 pass CBR mode.
+void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi);
+
+// Check if we should not update golden reference, based on past refresh stats.
+void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi);
// Setup cyclic background refresh: set delta q and segmentation map.
void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi);
-void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
- int64_t rate_sb, int64_t dist_sb);
-
int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+ return segment_id == CR_SEGMENT_ID_BOOST1 ||
+ segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+ if (segment_id == CR_SEGMENT_ID_BOOST1)
+ return CR_SEGMENT_ID_BOOST1;
+ else if (segment_id == CR_SEGMENT_ID_BOOST2)
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BASE;
+}
+
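A minimal usage sketch for the two helpers above; seg_map and num_blocks are hypothetical stand-ins for the encoder's segmentation map and its size:

/* Tally refreshed blocks from a segment map, the way
 * vp9_cyclic_refresh_postencode() does. */
int i, boost1 = 0, boost2 = 0;
for (i = 0; i < num_blocks; ++i) {
  if (cyclic_refresh_segment_id(seg_map[i]) == CR_SEGMENT_ID_BOOST1)
    ++boost1;
  else if (cyclic_refresh_segment_id(seg_map[i]) == CR_SEGMENT_ID_BOOST2)
    ++boost2;
}

Note cyclic_refresh_segment_id() also maps any out-of-range id back to CR_SEGMENT_ID_BASE, so callers need not validate the map entries first.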
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c
index 7d75f09a418..be6f7e4ee53 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c
@@ -19,18 +19,16 @@
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/common/vp9_systemdependent.h"
-#define ENERGY_MIN (-1)
+#define ENERGY_MIN (-4)
#define ENERGY_MAX (1)
#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
#define ENERGY_IN_BOUNDS(energy)\
assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
-static double q_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
-static double rdmult_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
-static int segment_id[MAX_SEGMENTS] = { 5, 3, 1, 0, 2, 4, 6, 7 };
+static const double rate_ratio[MAX_SEGMENTS] =
+ {2.5, 2.0, 1.5, 1.0, 0.75, 1.0, 1.0, 1.0};
+static const int segment_id[ENERGY_SPAN] = {0, 1, 1, 2, 3, 4};
-#define Q_RATIO(i) q_ratio[(i) - ENERGY_MIN]
-#define RDMULT_RATIO(i) rdmult_ratio[(i) - ENERGY_MIN]
#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};
@@ -40,47 +38,12 @@ DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = {0};
unsigned int vp9_vaq_segment_id(int energy) {
ENERGY_IN_BOUNDS(energy);
-
return SEGMENT_ID(energy);
}
-double vp9_vaq_rdmult_ratio(int energy) {
- ENERGY_IN_BOUNDS(energy);
-
- vp9_clear_system_state();
-
- return RDMULT_RATIO(energy);
-}
-
-double vp9_vaq_inv_q_ratio(int energy) {
- ENERGY_IN_BOUNDS(energy);
-
- vp9_clear_system_state();
-
- return Q_RATIO(-energy);
-}
-
-void vp9_vaq_init() {
- int i;
- double base_ratio;
-
- assert(ENERGY_SPAN <= MAX_SEGMENTS);
-
- vp9_clear_system_state();
-
- base_ratio = 1.5;
-
- for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
- Q_RATIO(i) = pow(base_ratio, i/3.0);
- }
-}
-
void vp9_vaq_frame_setup(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
struct segmentation *seg = &cm->seg;
- const double base_q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
- const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
- cm->y_dc_delta_q);
int i;
if (cm->frame_type == KEY_FRAME ||
@@ -91,26 +54,28 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi) {
seg->abs_delta = SEGMENT_DELTADATA;
- vp9_clear_system_state();
+ vp9_clear_system_state();
- for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
- int qindex_delta, segment_rdmult;
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ int qindex_delta =
+ vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+ rate_ratio[i], cm->bit_depth);
- if (Q_RATIO(i) == 1) {
- // No need to enable SEG_LVL_ALT_Q for this segment
- RDMULT_RATIO(i) = 1;
- continue;
+ // We don't allow qindex 0 in a segment if the base value is not 0.
+ // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
}
- qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i),
- cm->bit_depth);
- vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
- vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
-
- segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
- cm->y_dc_delta_q);
+ // No need to enable SEG_LVL_ALT_Q for this segment.
+ if (rate_ratio[i] == 1.0) {
+ continue;
+ }
- RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
}
}
}
@@ -167,12 +132,19 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
}
}
-int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
- double energy;
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
unsigned int var = block_variance(cpi, x, bs);
-
vp9_clear_system_state();
+ return log(var + 1.0);
+}
- energy = 0.9 * (log(var + 1.0) - 10.0);
+#define DEFAULT_E_MIDPOINT 10.0
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ double energy;
+ double energy_midpoint;
+ vp9_clear_system_state();
+ energy_midpoint =
+ (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
+ energy = vp9_log_block_var(cpi, x, bs) - energy_midpoint;
return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
}
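A worked example with an illustrative block variance: var = 8000 gives vp9_log_block_var() = log(8001) ≈ 8.99; against the default midpoint of 10.0 this rounds to an energy of -1, which lies within [-4, 1] and, via segment_id[] above, maps to segment 2 with rate_ratio 1.5.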
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h
index d1a459fe9ec..a0effa31165 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h
@@ -19,13 +19,10 @@ extern "C" {
#endif
unsigned int vp9_vaq_segment_id(int energy);
-double vp9_vaq_rdmult_ratio(int energy);
-double vp9_vaq_inv_q_ratio(int energy);
-
-void vp9_vaq_init();
void vp9_vaq_frame_setup(VP9_COMP *cpi);
int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c
index e9810c894d3..95b13bb7718 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c
@@ -19,6 +19,156 @@ unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
return (sum + 32) >> 6;
}
+unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 4; ++i, s+=p)
+ for (j = 0; j < 4; sum += s[j], ++j) {}
+
+ return (sum + 8) >> 4;
+}
+
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int16_t c0 = b0 + b2;
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ int16_t buffer[64];
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(src_diff, src_stride, tmp_buf);
+ tmp_buf += 8;
+ ++src_diff;
+ }
+
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(tmp_buf, 8, coeff);
+ coeff += 8;
+ ++tmp_buf;
+ }
+}
+
+// 16x16 2D Hadamard transform; the cross-block stage is done in place on coeff.
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+ + (idx & 0x01) * 8;
+ vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; ++idx) {
+ int16_t a0 = coeff[0];
+ int16_t a1 = coeff[64];
+ int16_t a2 = coeff[128];
+ int16_t a3 = coeff[192];
+
+ int16_t b0 = a0 + a1;
+ int16_t b1 = a0 - a1;
+ int16_t b2 = a2 + a3;
+ int16_t b3 = a2 - a3;
+
+ coeff[0] = (b0 + b2) >> 1;
+ coeff[64] = (b1 + b3) >> 1;
+ coeff[128] = (b0 - b2) >> 1;
+ coeff[192] = (b1 - b3) >> 1;
+
+ ++coeff;
+ }
+}
+
+int16_t vp9_satd_c(const int16_t *coeff, int length) {
+ int i;
+ int satd = 0;
+ for (i = 0; i < length; ++i)
+ satd += abs(coeff[i]);
+
+ return (int16_t)satd;
+}
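A usage sketch combining the two new kernels, as an encoder cost model might; the residual buffer here is a hypothetical stand-in:

/* Cost one 8x8 residual block via Hadamard + SATD. */
int16_t diff[64];   /* src - pred residual, stored with stride 8 */
int16_t coeff[64];  /* transform output */
int cost;
/* ... fill diff[] with the prediction residual ... */
vp9_hadamard_8x8_c(diff, 8, coeff);
cost = vp9_satd_c(coeff, 64);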
+
+// Integer projection onto row vectors.
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
+ const int ref_stride, const int height) {
+ int idx;
+ const int norm_factor = MAX(8, height >> 1);
+ for (idx = 0; idx < 16; ++idx) {
+ int i;
+ hbuf[idx] = 0;
+ for (i = 0; i < height; ++i)
+ hbuf[idx] += ref[i * ref_stride];
+ hbuf[idx] /= norm_factor;
+ ++ref;
+ }
+}
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
+ int idx;
+ int16_t sum = 0;
+ for (idx = 0; idx < width; ++idx)
+ sum += ref[idx];
+ return sum;
+}
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
+ const int bwl) {
+ int i;
+ int width = 4 << bwl;
+ int sse = 0, mean = 0, var;
+
+ for (i = 0; i < width; ++i) {
+ int diff = ref[i] - src[i];
+ mean += diff;
+ sse += diff * diff;
+ }
+
+ var = sse - ((mean * mean) >> (bwl + 2));
+ return var;
+}
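Here width = 4 << bwl, so dividing the squared mean by width is exactly the shift by (bwl + 2): var = sse - mean^2 / width. For example, bwl = 2 gives a 16-sample vector and the mean term (mean * mean) >> 4.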
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ int i, j;
+ *min = 255;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j]-d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
+
#if CONFIG_VP9_HIGHBITDEPTH
unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
int i, j;
@@ -29,5 +179,32 @@ unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
return (sum + 32) >> 6;
}
+
+unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+ int i, j;
+ int sum = 0;
+ const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+ for (i = 0; i < 4; ++i, s+=p)
+ for (j = 0; j < 4; sum += s[j], ++j) {}
+
+ return (sum + 8) >> 4;
+}
+
+void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+ int i, j;
+ const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
+ *min = 255;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j]-d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
#endif // CONFIG_VP9_HIGHBITDEPTH
+
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
index 421e049697a..d20e067669f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
@@ -34,17 +34,15 @@
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_write_bit_buffer.h"
-static struct vp9_token intra_mode_encodings[INTRA_MODES];
-static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
-static struct vp9_token partition_encodings[PARTITION_TYPES];
-static struct vp9_token inter_mode_encodings[INTER_MODES];
-
-void vp9_entropy_mode_init() {
- vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
- vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
- vp9_tokens_from_tree(partition_encodings, vp9_partition_tree);
- vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree);
-}
+static const struct vp9_token intra_mode_encodings[INTRA_MODES] = {
+ {0, 1}, {6, 3}, {28, 5}, {30, 5}, {58, 6}, {59, 6}, {126, 7}, {127, 7},
+ {62, 6}, {2, 2}};
+static const struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
+ {{0, 1}, {2, 2}, {3, 2}};
+static const struct vp9_token partition_encodings[PARTITION_TYPES] =
+ {{0, 1}, {2, 2}, {6, 3}, {7, 3}};
+static const struct vp9_token inter_mode_encodings[INTER_MODES] =
+ {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
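Each vp9_token pairs a code value with its bit length, matching a depth-first walk of the corresponding tree. For example, assuming the usual NEARESTMV/NEARMV/ZEROMV/NEWMV ordering of INTER_MODES, ZEROMV is {0, 1} (a single 0 bit) and NEWMV is {7, 3} (bits 1,1,1), so the common zero-motion case is cheapest before entropy coding. Baking the tables in as const replaces the former runtime construction in vp9_entropy_mode_init(), removed above.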
static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode,
const vp9_prob *probs) {
@@ -79,12 +77,12 @@ static void prob_diff_update(const vp9_tree_index *tree,
}
static void write_selected_tx_size(const VP9_COMMON *cm,
- const MACROBLOCKD *xd,
- TX_SIZE tx_size, BLOCK_SIZE bsize,
- vp9_writer *w) {
+ const MACROBLOCKD *xd, vp9_writer *w) {
+ TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
- &cm->fc.tx_probs);
+ &cm->fc->tx_probs);
vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
@@ -104,19 +102,21 @@ static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
}
}
-static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w,
+ FRAME_COUNTS *counts) {
int k;
for (k = 0; k < SKIP_CONTEXTS; ++k)
- vp9_cond_prob_diff_update(w, &cm->fc.skip_probs[k], cm->counts.skip[k]);
+ vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
}
-static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w,
+ FRAME_COUNTS *counts) {
int j;
for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
prob_diff_update(vp9_switchable_interp_tree,
- cm->fc.switchable_interp_prob[j],
- cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w);
+ cm->fc->switchable_interp_prob[j],
+ counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
}
static void pack_mb_tokens(vp9_writer *w,
@@ -201,7 +201,7 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
// This function encodes the reference frame
static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
vp9_writer *w) {
- const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const int is_compound = has_second_ref(mbmi);
const int segment_id = mbmi->segment_id;
@@ -237,8 +237,8 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
- const nmv_context *nmvc = &cm->fc.nmvc;
- const MACROBLOCK *const x = &cpi->mb;
+ const nmv_context *nmvc = &cm->fc->nmvc;
+ const MACROBLOCK *const x = &cpi->td.mb;
const MACROBLOCKD *const xd = &x->e_mbd;
const struct segmentation *const seg = &cm->seg;
const MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -268,14 +268,13 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
- !(is_inter &&
- (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
- write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w);
+ !(is_inter && skip)) {
+ write_selected_tx_size(cm, xd, w);
}
if (!is_inter) {
if (bsize >= BLOCK_8X8) {
- write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
+ write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]);
} else {
int idx, idy;
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -283,28 +282,27 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
- write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]);
+ write_intra_mode(w, b_mode, cm->fc->y_mode_prob[0]);
}
}
}
- write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]);
+ write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
} else {
const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
- const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx];
+ const vp9_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
write_ref_frames(cm, xd, w);
// If segment skip is not enabled code the mode.
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
if (bsize >= BLOCK_8X8) {
write_inter_mode(w, mode, inter_probs);
- ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(mode)];
}
}
if (cm->interp_filter == SWITCHABLE) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
vp9_write_token(w, vp9_switchable_interp_tree,
- cm->fc.switchable_interp_prob[ctx],
+ cm->fc->switchable_interp_prob[ctx],
&switchable_interp_encodings[mbmi->interp_filter]);
++cpi->interp_filter_selected[0][mbmi->interp_filter];
} else {
@@ -320,7 +318,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
const int j = idy * 2 + idx;
const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
write_inter_mode(w, b_mode, inter_probs);
- ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
if (b_mode == NEWMV) {
for (ref = 0; ref < 1 + is_compound; ++ref)
vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
@@ -341,12 +338,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
}
static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
- MODE_INFO *mi_8x8, vp9_writer *w) {
+ MODE_INFO **mi_8x8, vp9_writer *w) {
const struct segmentation *const seg = &cm->seg;
- const MODE_INFO *const mi = mi_8x8;
- const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride].src_mi;
- const MODE_INFO *const left_mi =
- xd->left_available ? mi_8x8[-1].src_mi : NULL;
+ const MODE_INFO *const mi = mi_8x8[0];
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
const MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -356,7 +352,7 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
write_skip(cm, xd, mbmi->segment_id, mi, w);
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
- write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w);
+ write_selected_tx_size(cm, xd, w);
if (bsize >= BLOCK_8X8) {
write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0));
@@ -382,11 +378,11 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
const TOKENEXTRA *const tok_end,
int mi_row, int mi_col) {
const VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
MODE_INFO *m;
- xd->mi = cm->mi + (mi_row * cm->mi_stride + mi_col);
- m = xd->mi;
+ xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+ m = xd->mi[0];
set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
@@ -429,7 +425,7 @@ static void write_modes_sb(VP9_COMP *cpi,
TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
int mi_row, int mi_col, BLOCK_SIZE bsize) {
const VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
const int bsl = b_width_log2_lookup[bsize];
const int bs = (1 << bsl) / 4;
@@ -440,7 +436,7 @@ static void write_modes_sb(VP9_COMP *cpi,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- m = cm->mi[mi_row * cm->mi_stride + mi_col].src_mi;
+ m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
partition = partition_lookup[bsl][m->mbmi.sb_type];
write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
@@ -485,11 +481,12 @@ static void write_modes_sb(VP9_COMP *cpi,
static void write_modes(VP9_COMP *cpi,
const TileInfo *const tile, vp9_writer *w,
TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
int mi_row, mi_col;
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
- vp9_zero(cpi->mb.e_mbd.left_seg_context);
+ vp9_zero(xd->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
@@ -500,7 +497,7 @@ static void write_modes(VP9_COMP *cpi,
static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size,
vp9_coeff_stats *coef_branch_ct,
vp9_coeff_probs_model *coef_probs) {
- vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size];
+ vp9_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size];
unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
cpi->common.counts.eob_branch[tx_size];
int i, j, k, l, m;
@@ -528,10 +525,12 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
TX_SIZE tx_size,
vp9_coeff_stats *frame_branch_ct,
vp9_coeff_probs_model *new_coef_probs) {
- vp9_coeff_probs_model *old_coef_probs = cpi->common.fc.coef_probs[tx_size];
+ vp9_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
const vp9_prob upd = DIFF_UPDATE_PROB;
const int entropy_nodes_update = UNCONSTRAINED_NODES;
int i, j, k, l, t;
+ int stepsize = cpi->sf.coeff_prob_appx_step;
+
switch (cpi->sf.use_fast_coef_updates) {
case TWO_LOOP: {
/* dry run to see if there is any update at all needed */
@@ -549,7 +548,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
if (t == PIVOT_NODE)
s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
- old_coef_probs[i][j][k][l], &newp, upd);
+ old_coef_probs[i][j][k][l], &newp, upd, stepsize);
else
s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
@@ -587,7 +586,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
if (t == PIVOT_NODE)
s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
- old_coef_probs[i][j][k][l], &newp, upd);
+ old_coef_probs[i][j][k][l], &newp, upd, stepsize);
else
s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t],
@@ -608,14 +607,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
return;
}
- case ONE_LOOP:
case ONE_LOOP_REDUCED: {
- const int prev_coef_contexts_to_update =
- cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ?
- COEFF_CONTEXTS >> 1 : COEFF_CONTEXTS;
- const int coef_band_to_update =
- cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ?
- COEF_BANDS >> 1 : COEF_BANDS;
int updates = 0;
int noupdates_before_first = 0;
for (i = 0; i < PLANE_TYPES; ++i) {
@@ -628,21 +620,19 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
int s;
int u = 0;
- if (l >= prev_coef_contexts_to_update ||
- k >= coef_band_to_update) {
- u = 0;
+
+ if (t == PIVOT_NODE) {
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0],
+ old_coef_probs[i][j][k][l], &newp, upd, stepsize);
} else {
- if (t == PIVOT_NODE)
- s = vp9_prob_diff_update_savings_search_model(
- frame_branch_ct[i][j][k][l][0],
- old_coef_probs[i][j][k][l], &newp, upd);
- else
- s = vp9_prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][l][t],
- *oldp, &newp, upd);
- if (s > 0 && newp != *oldp)
- u = 1;
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t],
+ *oldp, &newp, upd);
}
+
+ if (s > 0 && newp != *oldp)
+ u = 1;
updates += u;
if (u == 0 && updates == 0) {
noupdates_before_first++;
@@ -671,7 +661,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
}
return;
}
-
default:
assert(0);
}
@@ -681,16 +670,19 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
const TX_MODE tx_mode = cpi->common.tx_mode;
const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
TX_SIZE tx_size;
- vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
- vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
-
- for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
- build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
- frame_coef_probs[tx_size]);
-
- for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
- update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size],
- frame_coef_probs[tx_size]);
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
+ vp9_coeff_stats frame_branch_ct[PLANE_TYPES];
+ vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+ if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 ||
+ (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
+ vp9_write_bit(w, 0);
+ } else {
+ build_tree_distribution(cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ }
+ }
}
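The rewritten update_coef_probs above gates the whole per-tx_size update behind two cheap checks before doing any tree-distribution work. A minimal sketch of that gating, with hypothetical names (tx_totals and search_uses_8x8_only stand in for cpi->td.counts->tx.tx_totals[tx_size] and cpi->sf.tx_size_search_method == USE_TX_8X8):

    /* Sketch only: nonzero when a coefficient-probability update is worth
     * signaling for this transform size. */
    static int worth_updating(int tx_totals, TX_SIZE tx_size,
                              int search_uses_8x8_only) {
      if (tx_totals <= 20)          /* too few blocks to amortize the cost */
        return 0;
      if (tx_size >= TX_16X16 && search_uses_8x8_only)
        return 0;                   /* sizes the RD search never selects */
      return 1;
    }

When this returns 0, the encoder writes a single 0 bit, which the decoder is assumed to read as the usual "no update" marker for that transform size.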
static void encode_loopfilter(struct loopfilter *lf,
@@ -813,7 +805,8 @@ static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
}
}
-static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w,
+ FRAME_COUNTS *counts) {
// Mode
vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
if (cm->tx_mode >= ALLOW_32X32)
@@ -828,22 +821,22 @@ static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
+ tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], ct_8x8p);
for (j = 0; j < TX_SIZES - 3; j++)
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
+ vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
+ tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], ct_16x16p);
for (j = 0; j < TX_SIZES - 2; j++)
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
+ vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j],
ct_16x16p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
+ tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], ct_32x32p);
for (j = 0; j < TX_SIZES - 1; j++)
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
+ vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j],
ct_32x32p[j]);
}
}
@@ -858,7 +851,7 @@ static void write_interp_filter(INTERP_FILTER filter,
vp9_wb_write_literal(wb, filter_to_literal[filter], 2);
}
-static void fix_interp_filter(VP9_COMMON *cm) {
+static void fix_interp_filter(VP9_COMMON *cm, FRAME_COUNTS *counts) {
if (cm->interp_filter == SWITCHABLE) {
// Check to see if only one of the filters is actually used
int count[SWITCHABLE_FILTERS];
@@ -866,7 +859,7 @@ static void fix_interp_filter(VP9_COMMON *cm) {
for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
count[i] = 0;
for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
- count[i] += cm->counts.switchable_interp[j][i];
+ count[i] += counts->switchable_interp[j][i];
c += (count[i] > 0);
}
if (c == 1) {
@@ -929,42 +922,31 @@ static int get_refresh_mask(VP9_COMP *cpi) {
static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
VP9_COMMON *const cm = &cpi->common;
vp9_writer residual_bc;
-
int tile_row, tile_col;
- TOKENEXTRA *tok[4][1 << 6], *tok_end;
+ TOKENEXTRA *tok_end;
size_t total_size = 0;
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
- TileInfo tile[4][1 << 6];
- TOKENEXTRA *pre_tok = cpi->tok;
- int tile_tok = 0;
- vpx_memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) *
- mi_cols_aligned_to_sb(cm->mi_cols));
-
- for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col);
-
- tok[tile_row][tile_col] = pre_tok + tile_tok;
- pre_tok = tok[tile_row][tile_col];
- tile_tok = allocated_tokens(tile[tile_row][tile_col]);
- }
- }
+ memset(cm->above_seg_context, 0,
+ sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- const TileInfo * const ptile = &tile[tile_row][tile_col];
+ int tile_idx = tile_row * tile_cols + tile_col;
+ TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
- tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
+ tok_end = cpi->tile_tok[tile_row][tile_col] +
+ cpi->tok_count[tile_row][tile_col];
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
else
vp9_start_encode(&residual_bc, data_ptr + total_size);
- write_modes(cpi, ptile, &residual_bc, &tok[tile_row][tile_col], tok_end);
- assert(tok[tile_row][tile_col] == tok_end);
+ write_modes(cpi, &cpi->tile_data[tile_idx].tile_info,
+ &residual_bc, &tok, tok_end);
+ assert(tok == tok_end);
vp9_stop_encode(&residual_bc);
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
// size of this tile
@@ -1006,8 +988,6 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
MV_REFERENCE_FRAME ref_frame;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
- found = cm->width == cfg->y_crop_width &&
- cm->height == cfg->y_crop_height;
// Set "found" to 0 for temporal svc and for spatial svc key frame
if (cpi->use_svc &&
@@ -1020,6 +1000,9 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
cpi->svc.layer_context[0].frames_from_key_frame <
cpi->svc.number_temporal_layers + 1))) {
found = 0;
+ } else if (cfg != NULL) {
+ found = cm->width == cfg->y_crop_width &&
+ cm->height == cfg->y_crop_height;
}
vp9_wb_write_bit(wb, found);
if (found) {
@@ -1068,7 +1051,7 @@ static void write_bitdepth_colorspace_sampling(
vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
}
vp9_wb_write_literal(wb, cm->color_space, 3);
- if (cm->color_space != SRGB) {
+ if (cm->color_space != VPX_CS_SRGB) {
vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
@@ -1087,6 +1070,7 @@ static void write_bitdepth_colorspace_sampling(
static void write_uncompressed_header(VP9_COMP *cpi,
struct vp9_write_bit_buffer *wb) {
VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
@@ -1130,7 +1114,8 @@ static void write_uncompressed_header(VP9_COMP *cpi,
MV_REFERENCE_FRAME ref_frame;
vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- vp9_wb_write_literal(wb, get_ref_frame_idx(cpi, ref_frame),
+ assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+ vp9_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
REF_FRAMES_LOG2);
vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
}
@@ -1139,7 +1124,7 @@ static void write_uncompressed_header(VP9_COMP *cpi,
vp9_wb_write_bit(wb, cm->allow_high_precision_mv);
- fix_interp_filter(cm);
+ fix_interp_filter(cm, cpi->td.counts);
write_interp_filter(cm->interp_filter, wb);
}
}
@@ -1153,15 +1138,16 @@ static void write_uncompressed_header(VP9_COMP *cpi,
encode_loopfilter(&cm->lf, wb);
encode_quantization(cm, wb);
- encode_segmentation(cm, &cpi->mb.e_mbd, wb);
+ encode_segmentation(cm, xd, wb);
write_tile_info(cm, wb);
}
static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- FRAME_CONTEXT *const fc = &cm->fc;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ FRAME_CONTEXT *const fc = cm->fc;
+ FRAME_COUNTS *counts = cpi->td.counts;
vp9_writer header_bc;
vp9_start_encode(&header_bc, data);
@@ -1169,28 +1155,26 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (xd->lossless)
cm->tx_mode = ONLY_4X4;
else
- encode_txfm_probs(cm, &header_bc);
+ encode_txfm_probs(cm, &header_bc, counts);
update_coef_probs(cpi, &header_bc);
- update_skip_probs(cm, &header_bc);
+ update_skip_probs(cm, &header_bc, counts);
if (!frame_is_intra_only(cm)) {
int i;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
- prob_diff_update(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i],
- cm->counts.inter_mode[i], INTER_MODES, &header_bc);
-
- vp9_zero(cm->counts.inter_mode);
+ prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i],
+ counts->inter_mode[i], INTER_MODES, &header_bc);
if (cm->interp_filter == SWITCHABLE)
- update_switchable_interp_probs(cm, &header_bc);
+ update_switchable_interp_probs(cm, &header_bc, counts);
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
- cm->counts.intra_inter[i]);
+ counts->intra_inter[i]);
- if (cm->allow_comp_inter_inter) {
+ if (cpi->allow_comp_inter_inter) {
const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
@@ -1200,33 +1184,34 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (use_hybrid_pred)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
- cm->counts.comp_inter[i]);
+ counts->comp_inter[i]);
}
}
if (cm->reference_mode != COMPOUND_REFERENCE) {
for (i = 0; i < REF_CONTEXTS; i++) {
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
- cm->counts.single_ref[i][0]);
+ counts->single_ref[i][0]);
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
- cm->counts.single_ref[i][1]);
+ counts->single_ref[i][1]);
}
}
if (cm->reference_mode != SINGLE_REFERENCE)
for (i = 0; i < REF_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
- cm->counts.comp_ref[i]);
+ counts->comp_ref[i]);
for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
- prob_diff_update(vp9_intra_mode_tree, cm->fc.y_mode_prob[i],
- cm->counts.y_mode[i], INTRA_MODES, &header_bc);
+ prob_diff_update(vp9_intra_mode_tree, cm->fc->y_mode_prob[i],
+ counts->y_mode[i], INTRA_MODES, &header_bc);
for (i = 0; i < PARTITION_CONTEXTS; ++i)
prob_diff_update(vp9_partition_tree, fc->partition_prob[i],
- cm->counts.partition[i], PARTITION_TYPES, &header_bc);
+ counts->partition[i], PARTITION_TYPES, &header_bc);
- vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc);
+ vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
+ &counts->mv);
}
vp9_stop_encode(&header_bc);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
index b48826140f9..da6b4146422 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
@@ -18,8 +18,6 @@ extern "C" {
#include "vp9/encoder/vp9_encoder.h"
-void vp9_entropy_mode_init();
-
void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
@@ -29,7 +27,7 @@ static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
(is_two_pass_svc(cpi) &&
cpi->svc.spatial_layer_id == 0 &&
cpi->svc.layer_context[0].gold_ref_idx >=0 &&
- cpi->oxcf.ss_play_alternate[0]));
+ cpi->oxcf.ss_enable_auto_arf[0]));
}
#ifdef __cplusplus
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
index 5194c4c276b..04a1b8f3c2f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
@@ -40,8 +40,6 @@ struct macroblock_plane {
int16_t *round;
int64_t quant_thred[2];
- // Zbin Over Quant value
- int16_t zbin_extra;
};
/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
@@ -67,6 +65,11 @@ struct macroblock {
int rdmult;
int mb_energy;
+ // These are set to their default values at the beginning, and then adjusted
+ // further in the encoding process.
+ BLOCK_SIZE min_partition_size;
+ BLOCK_SIZE max_partition_size;
+
int mv_best_ref_index[MAX_REF_FRAMES];
unsigned int max_mv_context[MAX_REF_FRAMES];
unsigned int source_variance;
@@ -98,8 +101,6 @@ struct macroblock {
// note that token_costs is the cost when eob node is skipped
vp9_coeff_cost token_costs[TX_SIZES];
- int in_static_area;
-
int optimize;
// indicate if it is in the rd search loop or encoding process
@@ -117,6 +118,10 @@ struct macroblock {
// Used to store sub partition's choices.
MV pred_mv[MAX_REF_FRAMES];
+ // Strong color activity detection. Used in RTC coding mode to enhance
+ // the visual quality at the boundary of moving color objects.
+ uint8_t color_sensitivity[2];
+
void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_blockiness.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_blockiness.c
new file mode 100644
index 00000000000..b8629bd3bb5
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_blockiness.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+static int horizontal_filter(const uint8_t *s) {
+ return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
+}
+
+static int vertical_filter(const uint8_t *s, int p) {
+ return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6;
+}
+
+static int variance(int sum, int sum_squared, int size) {
+ return sum_squared / size - (sum / size) * (sum / size);
+}
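// Worked example of the integer variance above with size = 4: a flat run
// q = {10, 10, 10, 10} gives sum = 40 and sum_squared = 400, so
// 400 / 4 - (40 / 4) * (40 / 4) = 0, while q = {0, 0, 20, 20} gives
// 800 / 4 - 10 * 10 = 100. Flat edges therefore keep the
// 1 / (1 + var_0 + var_1) scaling near 1, which is what lets the metric
// emphasize them.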
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+// p0 p1 p2 p3
+// q0 q1 q2 q3
+// block edge ->
+// r0 r1 r2 r3
+// s0 s1 s2 s3
+
+// blockiness = p0*-2+q0*6+r0*-6+s0*2 +
+// p1*-2+q1*6+r1*-6+s1*2 +
+// p2*-2+q2*6+r2*-6+s2*2 +
+// p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(blockiness from reconstructed buffer -
+//                                blockiness from source buffer, 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calculation
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2)/4 - ((q0 + q1 + q2 + q3) / 4)^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2)/4 - ((r0 + r1 + r2 + r3) / 4)^2
+// The returned blockiness is the scaled value
+// reconstructed_blockiness / (1 + var_0 + var_1);
+int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, s += sp, r += rp) {
+ s_blockiness += horizontal_filter(s);
+ r_blockiness += horizontal_filter(r);
+ sum_0 += s[0];
+ sum_sq_0 += s[0]*s[0];
+ sum_1 += s[-1];
+ sum_sq_1 += s[-1]*s[-1];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// Calculate a blockiness level for a horizontal block edge,
+// in the same way as above.
+int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, ++s, ++r) {
+ s_blockiness += vertical_filter(s, sp);
+ r_blockiness += vertical_filter(r, rp);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-sp];
+ sum_sq_1 += s[-sp] * s[-sp];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// This function currently computes the blockiness for the entire frame by
+// looking at all block borders in steps of 4.
+double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+                          int width, int height) {
+ double blockiness = 0;
+ int i, j;
+ vp9_clear_system_state();
+ for (i = 0; i < height; i += 4, img1 += img1_pitch * 4,
+ img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4) {
+ if (i > 0 && i < height && j > 0 && j < width) {
+ blockiness += blockiness_vertical(img1 + j, img1_pitch,
+ img2 + j, img2_pitch, 4);
+ blockiness += blockiness_horizontal(img1 + j, img1_pitch,
+ img2 + j, img2_pitch, 4);
+ }
+ }
+ }
+ blockiness /= width * height / 16;
+ return blockiness;
+}
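A short usage sketch for the new metric (not part of the patch); src and recon are hypothetical YV12_BUFFER_CONFIG pointers of matching dimensions:

    // Hedged sketch: compare a reconstructed luma plane against its source.
    double b = vp9_get_blockiness(src->y_buffer, src->y_stride,
                                  recon->y_buffer, recon->y_stride,
                                  src->y_crop_width, src->y_crop_height);
    // b > 0 indicates blockiness added by coding; 0 means none was detected.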
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c
index 12acc51143a..f647ab39535 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c
@@ -87,7 +87,7 @@ static void free_tree_contexts(PC_TREE *tree) {
// partition level. There are contexts for none, horizontal, vertical, and
// split. Along with a block_size value and a selected block_size which
// represents the state of our search.
-void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
+void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
int i, j;
const int leaf_nodes = 64;
const int tree_nodes = 64 + 16 + 4 + 1;
@@ -97,24 +97,24 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
int square_index = 1;
int nodes;
- vpx_free(cpi->leaf_tree);
- CHECK_MEM_ERROR(cm, cpi->leaf_tree, vpx_calloc(leaf_nodes,
- sizeof(*cpi->leaf_tree)));
- vpx_free(cpi->pc_tree);
- CHECK_MEM_ERROR(cm, cpi->pc_tree, vpx_calloc(tree_nodes,
- sizeof(*cpi->pc_tree)));
+ vpx_free(td->leaf_tree);
+ CHECK_MEM_ERROR(cm, td->leaf_tree, vpx_calloc(leaf_nodes,
+ sizeof(*td->leaf_tree)));
+ vpx_free(td->pc_tree);
+ CHECK_MEM_ERROR(cm, td->pc_tree, vpx_calloc(tree_nodes,
+ sizeof(*td->pc_tree)));
- this_pc = &cpi->pc_tree[0];
- this_leaf = &cpi->leaf_tree[0];
+ this_pc = &td->pc_tree[0];
+ this_leaf = &td->leaf_tree[0];
// 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
// context so we only need to allocate 1 for each 8x8 block.
for (i = 0; i < leaf_nodes; ++i)
- alloc_mode_context(cm, 1, &cpi->leaf_tree[i]);
+ alloc_mode_context(cm, 1, &td->leaf_tree[i]);
// Sets up all the leaf nodes in the tree.
for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
- PC_TREE *const tree = &cpi->pc_tree[pc_tree_index];
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
tree->block_size = square[0];
alloc_tree_contexts(cm, tree, 4);
tree->leaf_split[0] = this_leaf++;
@@ -126,7 +126,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
// from leafs to the root.
for (nodes = 16; nodes > 0; nodes >>= 2) {
for (i = 0; i < nodes; ++i) {
- PC_TREE *const tree = &cpi->pc_tree[pc_tree_index];
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
tree->block_size = square[square_index];
for (j = 0; j < 4; j++)
@@ -135,24 +135,24 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
}
++square_index;
}
- cpi->pc_root = &cpi->pc_tree[tree_nodes - 1];
- cpi->pc_root[0].none.best_mode_index = 2;
+ td->pc_root = &td->pc_tree[tree_nodes - 1];
+ td->pc_root[0].none.best_mode_index = 2;
}
-void vp9_free_pc_tree(VP9_COMP *cpi) {
+void vp9_free_pc_tree(ThreadData *td) {
const int tree_nodes = 64 + 16 + 4 + 1;
int i;
// Set up all 4x4 mode contexts
for (i = 0; i < 64; ++i)
- free_mode_context(&cpi->leaf_tree[i]);
+ free_mode_context(&td->leaf_tree[i]);
// Sets up all the leaf nodes in the tree.
for (i = 0; i < tree_nodes; ++i)
- free_tree_contexts(&cpi->pc_tree[i]);
+ free_tree_contexts(&td->pc_tree[i]);
- vpx_free(cpi->pc_tree);
- cpi->pc_tree = NULL;
- vpx_free(cpi->leaf_tree);
- cpi->leaf_tree = NULL;
+ vpx_free(td->pc_tree);
+ td->pc_tree = NULL;
+ vpx_free(td->leaf_tree);
+ td->leaf_tree = NULL;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h
index 6b28ee59182..70bf032c34f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h
@@ -15,6 +15,7 @@
struct VP9_COMP;
struct VP9Common;
+struct ThreadData;
// Structure to hold snapshot of coding context during the mode picking process
typedef struct {
@@ -34,6 +35,7 @@ typedef struct {
int is_coded;
int num_4x4_blk;
int skip;
+ int pred_pixel_ready;
// For current partition, only if all Y, U, and V transform blocks'
// coefficients are quantized to 0, skippable is set to 0.
int skippable;
@@ -45,6 +47,11 @@ typedef struct {
int64_t tx_rd_diff[TX_MODES];
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  // TODO(jingning) Use RD_COST struct here instead. This involves a broader
+  // scope of refactoring.
+ int rate;
+ int64_t dist;
+
#if CONFIG_VP9_TEMPORAL_DENOISING
unsigned int newmv_sse;
unsigned int zeromv_sse;
@@ -73,7 +80,7 @@ typedef struct PC_TREE {
};
} PC_TREE;
-void vp9_setup_pc_tree(struct VP9Common *cm, struct VP9_COMP *cpi);
-void vp9_free_pc_tree(struct VP9_COMP *cpi);
+void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td);
+void vp9_free_pc_tree(struct ThreadData *td);
#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c
index 1090d04bb18..9e6ca3d594c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c
@@ -17,6 +17,7 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/encoder/vp9_dct.h"
static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
@@ -26,7 +27,7 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
return rv;
}
-static void fdct4(const tran_low_t *input, tran_low_t *output) {
+void vp9_fdct4(const tran_low_t *input, tran_low_t *output) {
tran_high_t step[4];
tran_high_t temp1, temp2;
@@ -37,12 +38,12 @@ static void fdct4(const tran_low_t *input, tran_low_t *output) {
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
- output[0] = fdct_round_shift(temp1);
- output[2] = fdct_round_shift(temp2);
+ output[0] = (tran_low_t)fdct_round_shift(temp1);
+ output[2] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
- output[1] = fdct_round_shift(temp1);
- output[3] = fdct_round_shift(temp2);
+ output[1] = (tran_low_t)fdct_round_shift(temp1);
+ output[3] = (tran_low_t)fdct_round_shift(temp2);
}
void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
@@ -98,12 +99,12 @@ void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
step[3] = input[0] - input[3];
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
- out[0] = fdct_round_shift(temp1);
- out[2] = fdct_round_shift(temp2);
+ out[0] = (tran_low_t)fdct_round_shift(temp1);
+ out[2] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
- out[1] = fdct_round_shift(temp1);
- out[3] = fdct_round_shift(temp2);
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[3] = (tran_low_t)fdct_round_shift(temp2);
// Do next column (which is a transposed row in second/horizontal pass)
in_pass0++;
in++;
@@ -123,7 +124,7 @@ void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
}
}
-static void fadst4(const tran_low_t *input, tran_low_t *output) {
+void vp9_fadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t x0, x1, x2, x3;
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -157,26 +158,18 @@ static void fadst4(const tran_low_t *input, tran_low_t *output) {
s3 = x2 - x0 + x3;
// 1-D transform scaling factor is sqrt(2).
- output[0] = fdct_round_shift(s0);
- output[1] = fdct_round_shift(s1);
- output[2] = fdct_round_shift(s2);
- output[3] = fdct_round_shift(s3);
+ output[0] = (tran_low_t)fdct_round_shift(s0);
+ output[1] = (tran_low_t)fdct_round_shift(s1);
+ output[2] = (tran_low_t)fdct_round_shift(s2);
+ output[3] = (tran_low_t)fdct_round_shift(s3);
}
-static const transform_2d FHT_4[] = {
- { fdct4, fdct4 }, // DCT_DCT = 0
- { fadst4, fdct4 }, // ADST_DCT = 1
- { fdct4, fadst4 }, // DCT_ADST = 2
- { fadst4, fadst4 } // ADST_ADST = 3
-};
-
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct4x4_c(input, output, stride);
} else {
tran_low_t out[4 * 4];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
@@ -189,7 +182,7 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
temp_in[0] += 1;
ht.cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- outptr[j * 4 + i] = temp_out[j];
+ out[j * 4 + i] = temp_out[j];
}
// Rows
@@ -203,7 +196,7 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
}
}
-static void fdct8(const tran_low_t *input, tran_low_t *output) {
+void vp9_fdct8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
@@ -227,16 +220,16 @@ static void fdct8(const tran_low_t *input, tran_low_t *output) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0] = fdct_round_shift(t0);
- output[2] = fdct_round_shift(t2);
- output[4] = fdct_round_shift(t1);
- output[6] = fdct_round_shift(t3);
+ output[0] = (tran_low_t)fdct_round_shift(t0);
+ output[2] = (tran_low_t)fdct_round_shift(t2);
+ output[4] = (tran_low_t)fdct_round_shift(t1);
+ output[6] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
+ t2 = (tran_low_t)fdct_round_shift(t0);
+ t3 = (tran_low_t)fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
@@ -249,10 +242,10 @@ static void fdct8(const tran_low_t *input, tran_low_t *output) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1] = fdct_round_shift(t0);
- output[3] = fdct_round_shift(t2);
- output[5] = fdct_round_shift(t1);
- output[7] = fdct_round_shift(t3);
+ output[1] = (tran_low_t)fdct_round_shift(t0);
+ output[3] = (tran_low_t)fdct_round_shift(t2);
+ output[5] = (tran_low_t)fdct_round_shift(t1);
+ output[7] = (tran_low_t)fdct_round_shift(t3);
}
void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
@@ -298,10 +291,10 @@ void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0 * 8] = fdct_round_shift(t0);
- output[2 * 8] = fdct_round_shift(t2);
- output[4 * 8] = fdct_round_shift(t1);
- output[6 * 8] = fdct_round_shift(t3);
+ output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
@@ -320,10 +313,10 @@ void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1 * 8] = fdct_round_shift(t0);
- output[3 * 8] = fdct_round_shift(t2);
- output[5 * 8] = fdct_round_shift(t1);
- output[7 * 8] = fdct_round_shift(t3);
+ output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
input++;
output++;
}
@@ -331,12 +324,124 @@ void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
// Rows
for (i = 0; i < 8; ++i) {
- fdct8(&intermediate[i * 8], &final_output[i * 8]);
+ vp9_fdct8(&intermediate[i * 8], &final_output[i * 8]);
for (j = 0; j < 8; ++j)
final_output[j + i * 8] /= 2;
}
}
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
+ tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int eob = -1;
+
+ int i, j;
+ tran_low_t intermediate[64];
+
+ // Transform columns
+ {
+ tran_low_t *output = intermediate;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ int i;
+ for (i = 0; i < 8; i++) {
+ // stage 1
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
+ input++;
+ output++;
+ }
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ vp9_fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
+ for (j = 0; j < 8; ++j)
+ coeff_ptr[j + i * 8] /= 2;
+ }
+
+  // TODO(jingning) Decide whether these arguments are still needed once the
+  // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+    // Quantization pass: quantize every coefficient in scan order and track
+    // the index of the last nonzero quantized coefficient in eob.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+ if (tmp)
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
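// Worked example of the quantization step above for a single AC coefficient
// (values are illustrative, not taken from a real quant table): with
// coeff = -73, round_ptr[1] = 12, quant_ptr[1] = 2731 (~65536 / 24) and
// dequant_ptr[1] = 24, we get abs_coeff = 73, tmp = 73 + 12 = 85, then
// (85 * 2731) >> 16 = 3, so qcoeff = -3 and dqcoeff = -3 * 24 = -72,
// which reconstructs close to the original -73.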
void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
@@ -434,10 +539,10 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = fdct_round_shift(t0);
- out[4] = fdct_round_shift(t2);
- out[8] = fdct_round_shift(t1);
- out[12] = fdct_round_shift(t3);
+ out[0] = (tran_low_t)fdct_round_shift(t0);
+ out[4] = (tran_low_t)fdct_round_shift(t2);
+ out[8] = (tran_low_t)fdct_round_shift(t1);
+ out[12] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
@@ -456,10 +561,10 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = fdct_round_shift(t0);
- out[6] = fdct_round_shift(t2);
- out[10] = fdct_round_shift(t1);
- out[14] = fdct_round_shift(t3);
+ out[2] = (tran_low_t)fdct_round_shift(t0);
+ out[6] = (tran_low_t)fdct_round_shift(t2);
+ out[10] = (tran_low_t)fdct_round_shift(t1);
+ out[14] = (tran_low_t)fdct_round_shift(t3);
}
// Work on the next eight values; step1 -> odd_results
{
@@ -502,20 +607,20 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
// step 6
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = fdct_round_shift(temp1);
- out[9] = fdct_round_shift(temp2);
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[9] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = fdct_round_shift(temp1);
- out[13] = fdct_round_shift(temp2);
+ out[5] = (tran_low_t)fdct_round_shift(temp1);
+ out[13] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = fdct_round_shift(temp1);
- out[11] = fdct_round_shift(temp2);
+ out[3] = (tran_low_t)fdct_round_shift(temp1);
+ out[11] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = fdct_round_shift(temp1);
- out[15] = fdct_round_shift(temp2);
+ out[7] = (tran_low_t)fdct_round_shift(temp1);
+ out[15] = (tran_low_t)fdct_round_shift(temp2);
}
// Do next column (which is a transposed row in second/horizontal pass)
in++;
@@ -528,7 +633,7 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
}
}
-static void fadst8(const tran_low_t *input, tran_low_t *output) {
+void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[7];
@@ -589,30 +694,22 @@ static void fadst8(const tran_low_t *input, tran_low_t *output) {
x6 = fdct_round_shift(s6);
x7 = fdct_round_shift(s7);
- output[0] = x0;
- output[1] = - x4;
- output[2] = x6;
- output[3] = - x2;
- output[4] = x3;
- output[5] = - x7;
- output[6] = x5;
- output[7] = - x1;
+ output[0] = (tran_low_t)x0;
+ output[1] = (tran_low_t)-x4;
+ output[2] = (tran_low_t)x6;
+ output[3] = (tran_low_t)-x2;
+ output[4] = (tran_low_t)x3;
+ output[5] = (tran_low_t)-x7;
+ output[6] = (tran_low_t)x5;
+ output[7] = (tran_low_t)-x1;
}
-static const transform_2d FHT_8[] = {
- { fdct8, fdct8 }, // DCT_DCT = 0
- { fadst8, fdct8 }, // ADST_DCT = 1
- { fdct8, fadst8 }, // DCT_ADST = 2
- { fadst8, fadst8 } // ADST_ADST = 3
-};
-
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct8x8_c(input, output, stride);
} else {
tran_low_t out[64];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
@@ -623,7 +720,7 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- outptr[j * 8 + i] = temp_out[j];
+ out[j * 8 + i] = temp_out[j];
}
// Rows
@@ -659,10 +756,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
c1 = e1 - c1;
a1 -= c1;
d1 += b1;
- op[0] = a1;
- op[4] = c1;
- op[8] = d1;
- op[12] = b1;
+ op[0] = (tran_low_t)a1;
+ op[4] = (tran_low_t)c1;
+ op[8] = (tran_low_t)d1;
+ op[12] = (tran_low_t)b1;
ip_pass0++;
op++;
@@ -683,10 +780,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
c1 = e1 - c1;
a1 -= c1;
d1 += b1;
- op[0] = a1 * UNIT_QUANT_FACTOR;
- op[1] = c1 * UNIT_QUANT_FACTOR;
- op[2] = d1 * UNIT_QUANT_FACTOR;
- op[3] = b1 * UNIT_QUANT_FACTOR;
+ op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+ op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+ op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+ op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
ip += 4;
op += 4;
@@ -694,7 +791,7 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
}
// Rewrote to use same algorithm as others.
-static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]) {
tran_high_t step1[8]; // canbe16
tran_high_t step2[8]; // canbe16
tran_high_t step3[8]; // canbe16
@@ -745,10 +842,10 @@ static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = fdct_round_shift(t0);
- out[4] = fdct_round_shift(t2);
- out[8] = fdct_round_shift(t1);
- out[12] = fdct_round_shift(t3);
+ out[0] = (tran_low_t)fdct_round_shift(t0);
+ out[4] = (tran_low_t)fdct_round_shift(t2);
+ out[8] = (tran_low_t)fdct_round_shift(t1);
+ out[12] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
@@ -767,10 +864,10 @@ static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = fdct_round_shift(t0);
- out[6] = fdct_round_shift(t2);
- out[10] = fdct_round_shift(t1);
- out[14] = fdct_round_shift(t3);
+ out[2] = (tran_low_t)fdct_round_shift(t0);
+ out[6] = (tran_low_t)fdct_round_shift(t2);
+ out[10] = (tran_low_t)fdct_round_shift(t1);
+ out[14] = (tran_low_t)fdct_round_shift(t3);
}
// step 2
@@ -816,26 +913,26 @@ static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
// step 6
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = fdct_round_shift(temp1);
- out[9] = fdct_round_shift(temp2);
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[9] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = fdct_round_shift(temp1);
- out[13] = fdct_round_shift(temp2);
+ out[5] = (tran_low_t)fdct_round_shift(temp1);
+ out[13] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = fdct_round_shift(temp1);
- out[11] = fdct_round_shift(temp2);
+ out[3] = (tran_low_t)fdct_round_shift(temp1);
+ out[11] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = fdct_round_shift(temp1);
- out[15] = fdct_round_shift(temp2);
+ out[7] = (tran_low_t)fdct_round_shift(temp1);
+ out[15] = (tran_low_t)fdct_round_shift(temp2);
}
-static void fadst16(const tran_low_t *input, tran_low_t *output) {
+void vp9_fadst16(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
@@ -980,38 +1077,30 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
x14 = fdct_round_shift(s14);
x15 = fdct_round_shift(s15);
- output[0] = x0;
- output[1] = - x8;
- output[2] = x12;
- output[3] = - x4;
- output[4] = x6;
- output[5] = x14;
- output[6] = x10;
- output[7] = x2;
- output[8] = x3;
- output[9] = x11;
- output[10] = x15;
- output[11] = x7;
- output[12] = x5;
- output[13] = - x13;
- output[14] = x9;
- output[15] = - x1;
+ output[0] = (tran_low_t)x0;
+ output[1] = (tran_low_t)-x8;
+ output[2] = (tran_low_t)x12;
+ output[3] = (tran_low_t)-x4;
+ output[4] = (tran_low_t)x6;
+ output[5] = (tran_low_t)x14;
+ output[6] = (tran_low_t)x10;
+ output[7] = (tran_low_t)x2;
+ output[8] = (tran_low_t)x3;
+ output[9] = (tran_low_t)x11;
+ output[10] = (tran_low_t)x15;
+ output[11] = (tran_low_t)x7;
+ output[12] = (tran_low_t)x5;
+ output[13] = (tran_low_t)-x13;
+ output[14] = (tran_low_t)x9;
+ output[15] = (tran_low_t)-x1;
}
-static const transform_2d FHT_16[] = {
- { fdct16, fdct16 }, // DCT_DCT = 0
- { fadst16, fdct16 }, // ADST_DCT = 1
- { fdct16, fadst16 }, // DCT_ADST = 2
- { fadst16, fadst16 } // ADST_ADST = 3
-};
-
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct16x16_c(input, output, stride);
} else {
tran_low_t out[256];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
@@ -1022,7 +1111,7 @@ void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
// Rows
@@ -1049,7 +1138,7 @@ static INLINE tran_high_t half_round_shift(tran_high_t input) {
return rv;
}
-static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
tran_high_t step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
@@ -1392,7 +1481,7 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
- fdct32(temp_in, temp_out, 0);
+ vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
@@ -1402,9 +1491,10 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
- fdct32(temp_in, temp_out, 0);
+ vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
- out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ out[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
}
}
@@ -1420,7 +1510,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
- fdct32(temp_in, temp_out, 0);
+ vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1433,9 +1523,9 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
- fdct32(temp_in, temp_out, 1);
+ vp9_fdct32(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
- out[j + i * 32] = temp_out[j];
+ out[j + i * 32] = (tran_low_t)temp_out[j];
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h
new file mode 100644
index 00000000000..49afcbbd5b8
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_DCT_H_
+#define VP9_ENCODER_VP9_DCT_H_
+
+#include "vp9/common/vp9_idct.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
+ int stride);
+void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride);
+void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+ int stride);
+
+void vp9_fdct4(const tran_low_t *input, tran_low_t *output);
+void vp9_fadst4(const tran_low_t *input, tran_low_t *output);
+void vp9_fdct8(const tran_low_t *input, tran_low_t *output);
+void vp9_fadst8(const tran_low_t *input, tran_low_t *output);
+void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]);
+void vp9_fadst16(const tran_low_t *input, tran_low_t *output);
+void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round);
+
+static const transform_2d FHT_4[] = {
+ { vp9_fdct4, vp9_fdct4 }, // DCT_DCT = 0
+ { vp9_fadst4, vp9_fdct4 }, // ADST_DCT = 1
+ { vp9_fdct4, vp9_fadst4 }, // DCT_ADST = 2
+ { vp9_fadst4, vp9_fadst4 } // ADST_ADST = 3
+};
+
+static const transform_2d FHT_8[] = {
+ { vp9_fdct8, vp9_fdct8 }, // DCT_DCT = 0
+ { vp9_fadst8, vp9_fdct8 }, // ADST_DCT = 1
+ { vp9_fdct8, vp9_fadst8 }, // DCT_ADST = 2
+ { vp9_fadst8, vp9_fadst8 } // ADST_ADST = 3
+};
+
+static const transform_2d FHT_16[] = {
+ { vp9_fdct16, vp9_fdct16 }, // DCT_DCT = 0
+ { vp9_fadst16, vp9_fdct16 }, // ADST_DCT = 1
+ { vp9_fdct16, vp9_fadst16 }, // DCT_ADST = 2
+ { vp9_fadst16, vp9_fadst16 } // ADST_ADST = 3
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_ENCODER_VP9_DCT_H_
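A brief usage sketch for the now-exported hybrid-transform tables, following the column/row pattern visible in vp9_fht4x4_c; the buffers here are hypothetical:

    const transform_2d ht = FHT_4[ADST_DCT];  // vp9_fadst4 cols, vp9_fdct4 rows
    tran_low_t temp_in[4], temp_out[4];
    /* fill temp_in with one scaled input column, then: */
    ht.cols(temp_in, temp_out);  // columns first; rows via ht.rows afterwards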
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c
index 4deeed2170c..08134e152aa 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c
@@ -45,34 +45,29 @@ static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
(void)bs;
(void)increase_denoising;
- return 25 * 25;
+ return 625;
}
static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) *
- (increase_denoising ? 60 : 40);
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 60 : 40);
}
static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
- int mv_row, int mv_col) {
- if (mv_row * mv_row + mv_col * mv_col >
+ int motion_magnitude) {
+ if (motion_magnitude >
noise_motion_thresh(bs, increase_denoising)) {
return 0;
} else {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) * 20;
+ return (1 << num_pels_log2_lookup[bs]) * 20;
}
}
int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
}
static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
}
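// Note on the rewritten thresholds: (4 << b_width_log2_lookup[bs]) *
// (4 << b_height_log2_lookup[bs]) is 16 << (bw_log2 + bh_log2), i.e. the
// pixel count of the block, which is exactly 1 << num_pels_log2_lookup[bs]
// (for BLOCK_8X8 both forms give 64), so these rewrites are assumed to be
// pure simplifications with no behavioral change.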
// TODO(jackychen): If increase_denoising is enabled in the future,
@@ -195,16 +190,6 @@ static uint8_t *block_start(uint8_t *framebuf, int stride,
return framebuf + (stride * mi_row * 8) + (mi_col * 8);
}
-static void copy_block(uint8_t *dest, int dest_stride,
- const uint8_t *src, int src_stride, BLOCK_SIZE bs) {
- int r;
- for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
- vpx_memcpy(dest, src, (4 << b_width_log2_lookup[bs]));
- dest += dest_stride;
- src += src_stride;
- }
-}
-
static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
MACROBLOCK *mb,
BLOCK_SIZE bs,
@@ -218,33 +203,23 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
MV_REFERENCE_FRAME frame;
MACROBLOCKD *filter_mbd = &mb->e_mbd;
- MB_MODE_INFO *mbmi = &filter_mbd->mi[0].src_mi->mbmi;
-
+ MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi;
MB_MODE_INFO saved_mbmi;
int i, j;
struct buf_2d saved_dst[MAX_MB_PLANE];
struct buf_2d saved_pre[MAX_MB_PLANE][2]; // 2 pre buffers
- // We will restore these after motion compensation.
- saved_mbmi = *mbmi;
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- for (j = 0; j < 2; ++j) {
- saved_pre[i][j] = filter_mbd->plane[i].pre[j];
- }
- saved_dst[i] = filter_mbd->plane[i].dst;
- }
-
mv_col = ctx->best_sse_mv.as_mv.col;
mv_row = ctx->best_sse_mv.as_mv.row;
-
*motion_magnitude = mv_row * mv_row + mv_col * mv_col;
-
frame = ctx->best_reference_frame;
+ saved_mbmi = *mbmi;
+
// If the best reference frame uses inter-prediction and there is enough of a
// difference in sum-squared-error, use it.
if (frame != INTRA_FRAME &&
- sse_diff > sse_diff_thresh(bs, increase_denoising, mv_row, mv_col)) {
+ sse_diff > sse_diff_thresh(bs, increase_denoising, *motion_magnitude)) {
mbmi->ref_frame[0] = ctx->best_reference_frame;
mbmi->mode = ctx->best_sse_inter_mode;
mbmi->mv[0] = ctx->best_sse_mv;
@@ -261,6 +236,26 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
ctx->newmv_sse = ctx->zeromv_sse;
}
+ if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+ // Restore everything to its original state
+ *mbmi = saved_mbmi;
+ return COPY_BLOCK;
+ }
+ if (*motion_magnitude >
+ (noise_motion_thresh(bs, increase_denoising) << 3)) {
+ // Restore everything to its original state
+ *mbmi = saved_mbmi;
+ return COPY_BLOCK;
+ }
+
+ // We will restore these after motion compensation.
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ for (j = 0; j < 2; ++j) {
+ saved_pre[i][j] = filter_mbd->plane[i].pre[j];
+ }
+ saved_dst[i] = filter_mbd->plane[i].dst;
+ }
+
// Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
// struct.
for (j = 0; j < 2; ++j) {
@@ -313,13 +308,6 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
mv_row = ctx->best_sse_mv.as_mv.row;
mv_col = ctx->best_sse_mv.as_mv.col;
- if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
- return COPY_BLOCK;
- }
- if (mv_row * mv_row + mv_col * mv_col >
- 8 * noise_motion_thresh(bs, increase_denoising)) {
- return COPY_BLOCK;
- }
return FILTER_BLOCK;
}
@@ -348,9 +336,15 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
}
if (decision == FILTER_BLOCK) {
- copy_block(src.buf, src.stride, avg_start, avg.y_stride, bs);
+ vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+ NULL, 0, NULL, 0,
+ num_4x4_blocks_wide_lookup[bs] << 2,
+ num_4x4_blocks_high_lookup[bs] << 2);
} else { // COPY_BLOCK
- copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs);
+ vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+ NULL, 0, NULL, 0,
+ num_4x4_blocks_wide_lookup[bs] << 2,
+ num_4x4_blocks_high_lookup[bs] << 2);
}
}
@@ -358,16 +352,26 @@ static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
int r;
const uint8_t *srcbuf = src.y_buffer;
uint8_t *destbuf = dest.y_buffer;
+
assert(dest.y_width == src.y_width);
assert(dest.y_height == src.y_height);
for (r = 0; r < dest.y_height; ++r) {
- vpx_memcpy(destbuf, srcbuf, dest.y_width);
+ memcpy(destbuf, srcbuf, dest.y_width);
destbuf += dest.y_stride;
srcbuf += src.y_stride;
}
}
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest,
+ YV12_BUFFER_CONFIG *src) {
+ uint8_t *tmp_buf = dest->y_buffer;
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+ dest->y_buffer = src->y_buffer;
+ src->y_buffer = tmp_buf;
+}
+
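// The pointer swap above replaces copy_frame()'s full luma-plane memcpy with
// an O(1) buffer exchange on reference refresh; this is assumed safe because
// the INTRA_FRAME running average is rewritten for every denoised frame, so
// its previous contents are no longer needed after the swap.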
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
YV12_BUFFER_CONFIG src,
FRAME_TYPE frame_type,
@@ -377,22 +381,23 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
if (frame_type == KEY_FRAME) {
int i;
// Start at 1 so as not to overwrite the INTRA_FRAME
- for (i = 1; i < MAX_REF_FRAMES; ++i) {
+ for (i = 1; i < MAX_REF_FRAMES; ++i)
copy_frame(denoiser->running_avg_y[i], src);
- }
- } else { /* For non key frames */
- if (refresh_alt_ref_frame) {
- copy_frame(denoiser->running_avg_y[ALTREF_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
- }
- if (refresh_golden_frame) {
- copy_frame(denoiser->running_avg_y[GOLDEN_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
- }
- if (refresh_last_frame) {
- copy_frame(denoiser->running_avg_y[LAST_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
- }
+ return;
+ }
+
+ /* For non key frames */
+ if (refresh_alt_ref_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
+ &denoiser->running_avg_y[INTRA_FRAME]);
+ }
+ if (refresh_golden_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
+ &denoiser->running_avg_y[INTRA_FRAME]);
+ }
+ if (refresh_last_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
+ &denoiser->running_avg_y[INTRA_FRAME]);
}
}
@@ -410,7 +415,7 @@ void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
ctx->best_zeromv_reference_frame = mbmi->ref_frame[0];
}
- if (mode == NEWMV) {
+ if (mbmi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
ctx->newmv_sse = sse;
ctx->best_sse_inter_mode = mode;
ctx->best_sse_mv = mbmi->mv[0];
@@ -425,6 +430,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
#endif
int border) {
int i, fail;
+ const int legacy_byte_alignment = 0;
assert(denoiser != NULL);
for (i = 0; i < MAX_REF_FRAMES; ++i) {
@@ -433,7 +439,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- border);
+ border, legacy_byte_alignment);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
@@ -448,7 +454,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- border);
+ border, legacy_byte_alignment);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
@@ -457,23 +463,21 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
make_grayscale(&denoiser->running_avg_y[i]);
#endif
denoiser->increase_denoising = 0;
+ denoiser->frame_buffer_initialized = 1;
return 0;
}
void vp9_denoiser_free(VP9_DENOISER *denoiser) {
int i;
+ denoiser->frame_buffer_initialized = 0;
if (denoiser == NULL) {
return;
}
for (i = 0; i < MAX_REF_FRAMES; ++i) {
- if (&denoiser->running_avg_y[i] != NULL) {
- vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
- }
- }
- if (&denoiser->mc_running_avg_y != NULL) {
- vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
+ vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
}
+ vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
}
#ifdef OUTPUT_YUV_DENOISED
@@ -482,15 +486,13 @@ static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
uint8_t *u = yuv->u_buffer;
uint8_t *v = yuv->v_buffer;
- // The '/2's are there because we have a 440 buffer, but we want to output
- // 420.
- for (r = 0; r < yuv->uv_height / 2; ++r) {
- for (c = 0; c < yuv->uv_width / 2; ++c) {
+ for (r = 0; r < yuv->uv_height; ++r) {
+ for (c = 0; c < yuv->uv_width; ++c) {
u[c] = UINT8_MAX / 2;
v[c] = UINT8_MAX / 2;
}
- u += yuv->uv_stride + yuv->uv_width / 2;
- v += yuv->uv_stride + yuv->uv_width / 2;
+ u += yuv->uv_stride;
+ v += yuv->uv_stride;
}
}
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h
index 421dfcd0cca..8eb5da1b8aa 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h
@@ -29,6 +29,7 @@ typedef struct vp9_denoiser {
YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG mc_running_avg_y;
int increase_denoising;
+ int frame_buffer_initialized;
} VP9_DENOISER;
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
index 6eff8c501ba..0e74784e9b6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -36,6 +36,7 @@
#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_extend.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_rd.h"
@@ -43,19 +44,11 @@
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_tokenize.h"
-#define GF_ZEROMV_ZBIN_BOOST 0
-#define LF_ZEROMV_ZBIN_BOOST 0
-#define MV_ZBIN_BOOST 0
-#define SPLIT_MV_ZBIN_BOOST 0
-#define INTRA_ZBIN_BOOST 0
-
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+static void encode_superblock(VP9_COMP *cpi, ThreadData * td,
+ TOKENEXTRA **t, int output_enabled,
int mi_row, int mi_col, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx);
-// Motion vector component magnitude threshold for defining fast motion.
-#define FAST_MOTION_MV_THRESH 24
-
// This is used as a reference when computing the source variance for the
// purposes of activity masking.
// Eventually this should be replaced by custom no-reference routines,
@@ -106,9 +99,9 @@ static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {
};
#endif // CONFIG_VP9_HIGHBITDEPTH
-static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
- const struct buf_2d *ref,
- BLOCK_SIZE bs) {
+unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs) {
unsigned int sse;
const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
VP9_VAR_OFFS, 0, &sse);
@@ -116,7 +109,7 @@ static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
}
#if CONFIG_VP9_HIGHBITDEPTH
-static unsigned int high_get_sby_perpixel_variance(
+unsigned int vp9_high_get_sby_perpixel_variance(
VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) {
unsigned int var, sse;
switch (bd) {
@@ -145,19 +138,21 @@ static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
const struct buf_2d *ref,
int mi_row, int mi_col,
BLOCK_SIZE bs) {
+ unsigned int sse, var;
+ uint8_t *last_y;
const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
- const uint8_t* last_y = &last->y_buffer[mi_row * MI_SIZE * last->y_stride +
- mi_col * MI_SIZE];
- unsigned int sse;
- const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
- last_y, last->y_stride, &sse);
+
+ assert(last != NULL);
+ last_y =
+ &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
+ var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
}
-static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi,
+static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row,
int mi_col) {
- unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+ unsigned int var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src,
mi_row, mi_col,
BLOCK_64X64);
if (var < 8)
@@ -170,34 +165,20 @@ static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi,
return BLOCK_8X8;
}
-static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi,
- int mi_row,
- int mi_col) {
- unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
- mi_row, mi_col,
- BLOCK_64X64);
- if (var < 4)
- return BLOCK_64X64;
- else if (var < 10)
- return BLOCK_32X32;
- else
- return BLOCK_16X16;
-}
-
// Lighter version of set_offsets that only sets the mode info
// pointers.
-static INLINE void set_modeinfo_offsets(VP9_COMMON *const cm,
- MACROBLOCKD *const xd,
- int mi_row,
- int mi_col) {
+static INLINE void set_mode_info_offsets(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ int mi_row,
+ int mi_col) {
const int idx_str = xd->mi_stride * mi_row + mi_col;
- xd->mi = cm->mi + idx_str;
- xd->mi[0].src_mi = &xd->mi[0];
+ xd->mi = cm->mi_grid_visible + idx_str;
+ xd->mi[0] = cm->mi + idx_str;
}
static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, int mi_col, BLOCK_SIZE bsize) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
@@ -207,9 +188,9 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
set_skip_context(xd, mi_row, mi_col);
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
- mbmi = &xd->mi[0].src_mi->mbmi;
+ mbmi = &xd->mi[0]->mbmi;
// Set up destination pointers.
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
@@ -258,25 +239,24 @@ static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd,
for (j = 0; j < block_height; ++j)
for (i = 0; i < block_width; ++i) {
if (mi_row + j < cm->mi_rows && mi_col + i < cm->mi_cols)
- xd->mi[j * xd->mi_stride + i].src_mi = &xd->mi[0];
+ xd->mi[j * xd->mi_stride + i] = xd->mi[0];
}
}
static void set_block_size(VP9_COMP * const cpi,
+ MACROBLOCKD *const xd,
int mi_row, int mi_col,
BLOCK_SIZE bsize) {
if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- set_modeinfo_offsets(&cpi->common, xd, mi_row, mi_col);
- xd->mi[0].src_mi->mbmi.sb_type = bsize;
- duplicate_mode_info_in_sb(&cpi->common, xd, mi_row, mi_col, bsize);
+ set_mode_info_offsets(&cpi->common, xd, mi_row, mi_col);
+ xd->mi[0]->mbmi.sb_type = bsize;
}
}
typedef struct {
int64_t sum_square_error;
int64_t sum_error;
- int count;
+ int log2_count;
int variance;
} var;
@@ -289,6 +269,11 @@ typedef struct {
typedef struct {
partition_variance part_variances;
var split[4];
+} v4x4;
+
+typedef struct {
+ partition_variance part_variances;
+ v4x4 split[4];
} v8x8;
typedef struct {
@@ -320,7 +305,6 @@ typedef enum {
static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
int i;
node->part_variances = NULL;
- vpx_memset(node->split, 0, sizeof(node->split));
switch (bsize) {
case BLOCK_64X64: {
v64x64 *vt = (v64x64 *) data;
@@ -347,6 +331,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
v8x8 *vt = (v8x8 *) data;
node->part_variances = &vt->part_variances;
for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ case BLOCK_4X4: {
+ v4x4 *vt = (v4x4 *) data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
node->split[i] = &vt->split[i];
break;
}
@@ -361,18 +352,18 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
v->sum_square_error = s2;
v->sum_error = s;
- v->count = c;
- if (c > 0)
- v->variance = (int)(256 *
- (v->sum_square_error - v->sum_error * v->sum_error /
- v->count) / v->count);
- else
- v->variance = 0;
+ v->log2_count = c;
+}
+
+static void get_variance(var *v) {
+ v->variance = (int)(256 * (v->sum_square_error -
+ ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
}
void sum_2_variances(const var *a, const var *b, var *r) {
+ assert(a->log2_count == b->log2_count);
fill_variance(a->sum_square_error + b->sum_square_error,
- a->sum_error + b->sum_error, a->count + b->count, r);
+ a->sum_error + b->sum_error, a->log2_count + 1, r);
}
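// [Editorial worked example, not part of the patch] With log2_count = k a
// node aggregates N = 2^k samples, and get_variance() evaluates
//   variance = 256 * (sum_square_error - sum_error^2 / N) / N
// entirely with shifts. For sum_square_error = 1000, sum_error = 60, k = 4:
//   (60 * 60) >> 4     = 225
//   1000 - 225         = 775
//   (256 * 775) >> 4   = 12400
// sum_2_variances() can simply store a->log2_count + 1 because merging two
// equal-sized children doubles the sample count (k -> k + 1).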
static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
@@ -387,93 +378,312 @@ static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
}
static int set_vt_partitioning(VP9_COMP *cpi,
+ MACROBLOCKD *const xd,
void *data,
BLOCK_SIZE bsize,
int mi_row,
- int mi_col) {
+ int mi_col,
+ int64_t threshold,
+ BLOCK_SIZE bsize_min,
+ int force_split) {
VP9_COMMON * const cm = &cpi->common;
variance_node vt;
const int block_width = num_8x8_blocks_wide_lookup[bsize];
const int block_height = num_8x8_blocks_high_lookup[bsize];
- // TODO(debargha): Choose this more intelligently.
- const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 64 : 4;
- int64_t threshold =
- (int64_t)(threshold_multiplier *
- vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+
assert(block_height == block_width);
tree_to_node(data, bsize, &vt);
- // Split none is available only if we have more than half a block size
- // in width and height inside the visible image.
- if (mi_col + block_width / 2 < cm->mi_cols &&
- mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->none.variance < threshold) {
- set_block_size(cpi, mi_row, mi_col, bsize);
- return 1;
- }
-
- // Only allow split for blocks above 16x16.
- if (bsize > BLOCK_16X16) {
- // Vertical split is available on all but the bottom border.
- if (mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->vert[0].variance < threshold &&
- vt.part_variances->vert[1].variance < threshold) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
- set_block_size(cpi, mi_row, mi_col, subsize);
- set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize);
- return 1;
- }
+ if (force_split == 1)
+ return 0;
- // Horizontal split is available on all but the right border.
+ // For bsize == bsize_min (16x16 or 8x8 for 8x8 or 4x4 downsampling,
+ // respectively), select this size if its variance is below the
+ // threshold; otherwise split is selected. Vert/horiz splits are not
+ // checked, as there are too few samples for a variance estimate.
+ if (bsize == bsize_min) {
+ // Variance already computed to set the force_split.
+ if (low_res || cm->frame_type == KEY_FRAME)
+ get_variance(&vt.part_variances->none);
if (mi_col + block_width / 2 < cm->mi_cols &&
- vt.part_variances->horz[0].variance < threshold &&
- vt.part_variances->horz[1].variance < threshold) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
- set_block_size(cpi, mi_row, mi_col, subsize);
- set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize);
+ mi_row + block_height / 2 < cm->mi_rows &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, xd, mi_row, mi_col, bsize);
return 1;
}
- }
-
- // This will only allow 8x8 if the 16x16 variance is very large.
- if (bsize == BLOCK_16X16) {
+ return 0;
+ } else if (bsize > bsize_min) {
+ // Variance already computed to set the force_split.
+ if (low_res || cm->frame_type == KEY_FRAME)
+ get_variance(&vt.part_variances->none);
+ // For key frame: take split for bsize above 32x32 or very high variance.
+ if (cm->frame_type == KEY_FRAME &&
+ (bsize > BLOCK_32X32 ||
+ vt.part_variances->none.variance > (threshold << 4))) {
+ return 0;
+ }
+ // If variance is low, take the bsize (no split).
if (mi_col + block_width / 2 < cm->mi_cols &&
mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->none.variance < (threshold << 6)) {
- set_block_size(cpi, mi_row, mi_col, bsize);
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, xd, mi_row, mi_col, bsize);
return 1;
}
+
+ // Check vertical split.
+ if (mi_row + block_height / 2 < cm->mi_rows) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+ get_variance(&vt.part_variances->vert[0]);
+ get_variance(&vt.part_variances->vert[1]);
+ if (vt.part_variances->vert[0].variance < threshold &&
+ vt.part_variances->vert[1].variance < threshold &&
+ get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+ set_block_size(cpi, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
+ return 1;
+ }
+ }
+ // Check horizontal split.
+ if (mi_col + block_width / 2 < cm->mi_cols) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+ get_variance(&vt.part_variances->horz[0]);
+ get_variance(&vt.part_variances->horz[1]);
+ if (vt.part_variances->horz[0].variance < threshold &&
+ vt.part_variances->horz[1].variance < threshold &&
+ get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+ set_block_size(cpi, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
+ return 1;
+ }
+ }
+
+ return 0;
}
return 0;
}
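// [Editorial summary, not part of the patch] Decision order inside
// set_vt_partitioning(), per the code above:
//   1. force_split set by the caller  -> return 0 (caller splits further).
//   2. bsize == bsize_min             -> take NONE if the block is fully
//                                        inside the image and its variance
//                                        is below threshold; else split.
//   3. bsize > bsize_min, key frame   -> force split above BLOCK_32X32 or
//                                        when variance > (threshold << 4).
//   4. otherwise                      -> try NONE, then VERT, then HORZ,
//                                        each gated on the per-half
//                                        variances and a valid chroma
//                                        block size for the subsize.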
-// This function chooses partitioning based on the variance
-// between source and reconstructed last, where variance is
-// computed for 8x8 downsampled inputs. Some things to check:
-// using the last source rather than reconstructed last, and
-// allowing for small downsampling (4x4 or 2x2) for selection
-// of smaller block sizes (i.e., < 16x16).
-static void choose_partitioning(VP9_COMP *cpi,
+void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ if (sf->partition_search_type != VAR_BASED_PARTITION &&
+ sf->partition_search_type != REFERENCE_PARTITION) {
+ return;
+ } else {
+ VP9_COMMON *const cm = &cpi->common;
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
+ const int threshold_multiplier = is_key_frame ? 20 : 1;
+ const int64_t threshold_base = (int64_t)(threshold_multiplier *
+ cpi->y_dequant[q][1]);
+
+ // TODO(marpan): Allow 4x4 partitions for inter-frames.
+ // use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
+ // If 4x4 partition is not used, then 8x8 partition will be selected
+ // if variance of 16x16 block is very high, so use larger threshold
+ // for 16x16 (threshold_bsize_min) in that case.
+
+ // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+ if (is_key_frame) {
+ cpi->vbp_thresholds[0] = threshold_base;
+ cpi->vbp_thresholds[1] = threshold_base >> 2;
+ cpi->vbp_thresholds[2] = threshold_base >> 2;
+ cpi->vbp_thresholds[3] = threshold_base << 2;
+ cpi->vbp_threshold_sad = 0;
+ cpi->vbp_bsize_min = BLOCK_8X8;
+ } else {
+ cpi->vbp_thresholds[1] = threshold_base;
+ if (cm->width <= 352 && cm->height <= 288) {
+ cpi->vbp_thresholds[0] = threshold_base >> 2;
+ cpi->vbp_thresholds[2] = threshold_base << 3;
+ cpi->vbp_threshold_sad = 100;
+ } else {
+ cpi->vbp_thresholds[0] = threshold_base;
+ cpi->vbp_thresholds[1] = (5 * threshold_base) >> 2;
+ cpi->vbp_thresholds[2] = threshold_base << cpi->oxcf.speed;
+ cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 ?
+ (cpi->y_dequant[q][1] << 1) : 1000;
+ }
+ cpi->vbp_bsize_min = BLOCK_16X16;
+ }
+ cpi->vbp_threshold_minmax = 15 + (q >> 3);
+ }
+}
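// [Editorial example, not part of the patch] Plugging a hypothetical dequant
// value y_dequant[q][1] == 40 into the logic above, purely for illustration:
//   key frame:  threshold_base = 20 * 40 = 800
//     vbp_thresholds = { 800, 200, 200, 3200 }, vbp_threshold_sad = 0,
//     vbp_bsize_min = BLOCK_8X8 (so 4x4 partitions are reachable).
//   inter frame, not low-res, hypothetical speed 6: threshold_base = 40
//     vbp_thresholds[0..2] = { 40, 50, 2560 },
//     vbp_threshold_sad = MAX(2 * 40, 1000) = 1000,
//     vbp_bsize_min = BLOCK_16X16.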
+
+// Compute the minmax over the 8x8 subblocks.
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high) {
+ int k;
+ int minmax_max = 0;
+ int minmax_min = 255;
+ // Loop over the 4 8x8 subblocks.
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ int min = 0;
+ int max = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+ d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+ } else {
+ vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+ d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+ }
+#else
+ vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+ d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+#endif
+ if ((max - min) > minmax_max)
+ minmax_max = (max - min);
+ if ((max - min) < minmax_min)
+ minmax_min = (max - min);
+ }
+ }
+ return (minmax_max - minmax_min);
+}
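// [Editorial example, not part of the patch] The return value is the spread
// of the per-subblock (max - min) ranges, a cheap probe for inconsistent
// texture. E.g. if the four 8x8 subblocks yield ranges {30, 8, 12, 25}, the
// result is 30 - 8 = 22, which forces an 8x8 split whenever it exceeds
// vbp_threshold_minmax (15 + (q >> 3), set in vp9_set_vbp_thresholds above).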
+
+static void modify_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int64_t threshold_base = (int64_t)(cpi->y_dequant[q][1]);
+
+ // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+ thresholds[1] = threshold_base;
+ if (cm->width <= 352 && cm->height <= 288) {
+ thresholds[0] = threshold_base >> 2;
+ thresholds[2] = threshold_base << 3;
+ } else {
+ thresholds[0] = threshold_base;
+ thresholds[1] = (5 * threshold_base) >> 2;
+ thresholds[2] = threshold_base << cpi->oxcf.speed;
+ }
+}
+
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x8_idx, int y8_idx, v8x8 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {
+ int x4_idx = x8_idx + ((k & 1) << 2);
+ int y4_idx = y8_idx + ((k >> 1) << 2);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+ int s_avg;
+ int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ } else {
+ s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ }
+#else
+ s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+#endif
+ sum = s_avg - d_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ }
+}
+
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx, v16x16 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int s_avg;
+ int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ } else {
+ s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ }
+#else
+ s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+#endif
+ sum = s_avg - d_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ }
+}
+
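// [Editorial note, not part of the patch] In both fill helpers above, each
// leaf contributes a single sample: sum = s_avg - d_avg and sse = sum * sum,
// with fill_variance() called at log2_count = 0. A lone leaf therefore has
// zero variance (256 * (sum^2 - sum^2) = 0); meaningful variance only
// emerges one level up, once sum_2_variances() merges leaves whose averages
// disagree.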
+// This function chooses partitioning based on the variance between source and
+// reconstructed last, where variance is computed for down-sampled inputs.
+static int choose_partitioning(VP9_COMP *cpi,
const TileInfo *const tile,
+ MACROBLOCK *x,
int mi_row, int mi_col) {
VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK *x = &cpi->mb;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
- int i, j, k;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int i, j, k, m;
v64x64 vt;
+ v16x16 vt2[16];
+ int force_split[21];
uint8_t *s;
const uint8_t *d;
int sp;
int dp;
int pixels_wide = 64, pixels_high = 64;
- int_mv nearest_mv, near_mv;
- const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
- const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+ int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+ cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
+
+ // Always use 4x4 partition for key frame.
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
+ const int use_4x4_partition = is_key_frame;
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+ int variance4x4downsample[16];
+
+ int segment_id = CR_SEGMENT_ID_BASE;
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
+ cm->last_frame_seg_map;
+ segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+
+ if (cyclic_refresh_segment_id_boosted(segment_id)) {
+ int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ modify_vbp_thresholds(cpi, thresholds, q);
+ }
+ }
- vp9_clear_system_state();
- vp9_zero(vt);
- set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
if (xd->mb_to_right_edge < 0)
pixels_wide += (xd->mb_to_right_edge >> 3);
@@ -483,20 +693,78 @@ static void choose_partitioning(VP9_COMP *cpi,
s = x->plane[0].src.buf;
sp = x->plane[0].src.stride;
- if (cm->frame_type != KEY_FRAME) {
- vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf);
+ if (!is_key_frame) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ unsigned int uv_sad;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+
+ const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ unsigned int y_sad, y_sad_g;
+ const BLOCK_SIZE bsize = BLOCK_32X32
+ + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
+
+ assert(yv12 != NULL);
+ if (yv12_g && yv12_g != yv12) {
+ vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ y_sad_g = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+ x->plane[0].src.stride,
+ xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+ } else {
+ y_sad_g = UINT_MAX;
+ }
+
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[LAST_FRAME - 1].sf);
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE;
+ mbmi->sb_type = BLOCK_64X64;
+ mbmi->mv[0].as_int = 0;
+ mbmi->interp_filter = BILINEAR;
+
+ y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+ if (y_sad_g < y_sad) {
+ vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ mbmi->ref_frame[0] = GOLDEN_FRAME;
+ mbmi->mv[0].as_int = 0;
+ y_sad = y_sad_g;
+ } else {
+ x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
+ }
- xd->mi[0].src_mi->mbmi.ref_frame[0] = LAST_FRAME;
- xd->mi[0].src_mi->mbmi.sb_type = BLOCK_64X64;
- vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
- xd->mi[0].src_mi->mbmi.ref_mvs[LAST_FRAME],
- &nearest_mv, &near_mv);
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
- xd->mi[0].src_mi->mbmi.mv[0] = nearest_mv;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
+ for (i = 1; i <= 2; ++i) {
+ struct macroblock_plane *p = &x->plane[i];
+ struct macroblockd_plane *pd = &xd->plane[i];
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+
+ if (bs == BLOCK_INVALID)
+ uv_sad = UINT_MAX;
+ else
+ uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+
+ x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
+ }
d = xd->plane[0].dst.buf;
dp = xd->plane[0].dst.stride;
+
+ // If the y_sad is very small, take the 64x64 partition and exit.
+ // Don't check on boosted segment for now, as 64x64 is suppressed there.
+ if (segment_id == CR_SEGMENT_ID_BASE &&
+ y_sad < cpi->vbp_threshold_sad) {
+ const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+ const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
+ if (mi_col + block_width / 2 < cm->mi_cols &&
+ mi_row + block_height / 2 < cm->mi_rows) {
+ set_block_size(cpi, xd, mi_row, mi_col, BLOCK_64X64);
+ return 0;
+ }
+ }
} else {
d = VP9_VAR_OFFS;
dp = 0;
@@ -518,104 +786,188 @@ static void choose_partitioning(VP9_COMP *cpi,
#endif // CONFIG_VP9_HIGHBITDEPTH
}
- // Fill in the entire tree of 8x8 variances for splits.
+ // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+ // 5-20 for the 16x16 blocks.
+ force_split[0] = 0;
+ // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+ // for splits.
for (i = 0; i < 4; i++) {
const int x32_idx = ((i & 1) << 5);
const int y32_idx = ((i >> 1) << 5);
+ const int i2 = i << 2;
+ force_split[i + 1] = 0;
for (j = 0; j < 4; j++) {
const int x16_idx = x32_idx + ((j & 1) << 4);
const int y16_idx = y32_idx + ((j >> 1) << 4);
+ const int split_index = 5 + i2 + j;
v16x16 *vst = &vt.split[i].split[j];
- for (k = 0; k < 4; k++) {
- int x_idx = x16_idx + ((k & 1) << 3);
- int y_idx = y16_idx + ((k >> 1) << 3);
- unsigned int sse = 0;
- int sum = 0;
-
- if (x_idx < pixels_wide && y_idx < pixels_high) {
- int s_avg, d_avg;
+ force_split[split_index] = 0;
+ variance4x4downsample[i2 + j] = 0;
+ if (!is_key_frame) {
+ fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- s_avg = vp9_highbd_avg_8x8(s + y_idx * sp + x_idx, sp);
- d_avg = vp9_highbd_avg_8x8(d + y_idx * dp + x_idx, dp);
- } else {
- s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
- d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+ xd->cur_buf->flags,
+#endif
+ pixels_wide,
+ pixels_high,
+ is_key_frame);
+ fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
+ get_variance(&vt.split[i].split[j].part_variances.none);
+ if (vt.split[i].split[j].part_variances.none.variance >
+ thresholds[2]) {
+ // 16x16 variance is above threshold for split, so force split to 8x8
+ // for this 16x16 block (this also forces splits for upper levels).
+ force_split[split_index] = 1;
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ } else if (vt.split[i].split[j].part_variances.none.variance >
+ thresholds[1] &&
+ !cyclic_refresh_segment_id_boosted(segment_id)) {
+ // We have some nominal amount of 16x16 variance (based on average),
+ // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+ // force split to 8x8 block for this 16x16 block.
+ int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd->cur_buf->flags,
+#endif
+ pixels_wide, pixels_high);
+ if (minmax > cpi->vbp_threshold_minmax) {
+ force_split[split_index] = 1;
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
}
-#else
- s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
- d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+ }
+ }
+ if (is_key_frame || (low_res &&
+ vt.split[i].split[j].part_variances.none.variance >
+ (thresholds[1] << 1))) {
+ force_split[split_index] = 0;
+ // Go down to 4x4 down-sampling for variance.
+ variance4x4downsample[i2 + j] = 1;
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ v8x8 *vst2 = is_key_frame ? &vst->split[k] :
+ &vt2[i2 + j].split[k];
+ fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd->cur_buf->flags,
#endif
- sum = s_avg - d_avg;
- sse = sum * sum;
+ pixels_wide,
+ pixels_high,
+ is_key_frame);
}
- // For an 8x8 block we have just one value, the average of all 64
- // pixels, so use 1. This means of course that there is no variance
- // in an 8x8 block.
- fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
}
}
}
+
// Fill the rest of the variance tree by summing split partition values.
for (i = 0; i < 4; i++) {
+ const int i2 = i << 2;
for (j = 0; j < 4; j++) {
- fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
+ if (variance4x4downsample[i2 + j] == 1) {
+ v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] :
+ &vt.split[i].split[j];
+ for (m = 0; m < 4; m++)
+ fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
+ fill_variance_tree(vtemp, BLOCK_16X16);
+ }
}
fill_variance_tree(&vt.split[i], BLOCK_32X32);
+ // If variance of this 32x32 block is above the threshold, force the block
+ // to split. This also forces a split on the upper (64x64) level.
+ if (!force_split[i + 1]) {
+ get_variance(&vt.split[i].part_variances.none);
+ if (vt.split[i].part_variances.none.variance > thresholds[1]) {
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ }
+ }
+ }
+ if (!force_split[0]) {
+ fill_variance_tree(&vt, BLOCK_64X64);
+ get_variance(&vt.part_variances.none);
}
- fill_variance_tree(&vt, BLOCK_64X64);
- // Now go through the entire structure, splitting every block size until
- // we get to one that's got a variance lower than our threshold, or we
- // hit 8x8.
+ // Now go through the entire structure, splitting every block size until
+ // we get to one that's got a variance lower than our threshold.
if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
- !set_vt_partitioning(cpi, &vt, BLOCK_64X64, mi_row, mi_col)) {
+ !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
+ thresholds[0], BLOCK_16X16, force_split[0])) {
for (i = 0; i < 4; ++i) {
const int x32_idx = ((i & 1) << 2);
const int y32_idx = ((i >> 1) << 2);
- if (!set_vt_partitioning(cpi, &vt.split[i], BLOCK_32X32,
- (mi_row + y32_idx), (mi_col + x32_idx))) {
+ const int i2 = i << 2;
+ if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32,
+ (mi_row + y32_idx), (mi_col + x32_idx),
+ thresholds[1], BLOCK_16X16,
+ force_split[i + 1])) {
for (j = 0; j < 4; ++j) {
const int x16_idx = ((j & 1) << 1);
const int y16_idx = ((j >> 1) << 1);
- // NOTE: Since this uses 8x8 downsampling for variance calculation
- // we cannot really select block size 8x8 (or even 8x16/16x8),
- // since we do not have sufficient samples for variance.
- // For now, 8x8 partition is only set if the variance of the 16x16
- // block is very high. This is controlled in set_vt_partitioning.
- if (!set_vt_partitioning(cpi, &vt.split[i].split[j],
- BLOCK_16X16,
+ // For inter frames: if variance4x4downsample[] == 1 for this 16x16
+ // block, then the variance is based on 4x4 down-sampling, so use vt2
+ // in set_vt_partitioning(), otherwise use vt.
+ v16x16 *vtemp = (!is_key_frame &&
+ variance4x4downsample[i2 + j] == 1) ?
+ &vt2[i2 + j] : &vt.split[i].split[j];
+ if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
mi_row + y32_idx + y16_idx,
- mi_col + x32_idx + x16_idx)) {
+ mi_col + x32_idx + x16_idx,
+ thresholds[2],
+ cpi->vbp_bsize_min,
+ force_split[5 + i2 + j])) {
for (k = 0; k < 4; ++k) {
const int x8_idx = (k & 1);
const int y8_idx = (k >> 1);
- set_block_size(cpi,
- (mi_row + y32_idx + y16_idx + y8_idx),
- (mi_col + x32_idx + x16_idx + x8_idx),
- BLOCK_8X8);
+ if (use_4x4_partition) {
+ if (!set_vt_partitioning(cpi, xd, &vtemp->split[k],
+ BLOCK_8X8,
+ mi_row + y32_idx + y16_idx + y8_idx,
+ mi_col + x32_idx + x16_idx + x8_idx,
+ thresholds[3], BLOCK_8X8, 0)) {
+ set_block_size(cpi, xd,
+ (mi_row + y32_idx + y16_idx + y8_idx),
+ (mi_col + x32_idx + x16_idx + x8_idx),
+ BLOCK_4X4);
+ }
+ } else {
+ set_block_size(cpi, xd,
+ (mi_row + y32_idx + y16_idx + y8_idx),
+ (mi_col + x32_idx + x16_idx + x8_idx),
+ BLOCK_8X8);
+ }
}
}
}
}
}
}
+ return 0;
}
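// [Editorial note, not part of the patch] The force_split[] indexing used
// throughout choose_partitioning() above is:
//   index 0           -> the 64x64 block,
//   index 1 + i       -> 32x32 block i (0..3),
//   index 5 + 4*i + j -> 16x16 block j inside 32x32 block i,
// so e.g. (i = 2, j = 3) maps to 5 + 8 + 3 = 16, matching
// split_index = 5 + i2 + j with i2 = i << 2. A flag set at a 16x16 block is
// propagated to indices i + 1 and 0, forcing every enclosing level to split.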
-static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+static void update_state(VP9_COMP *cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *ctx,
int mi_row, int mi_col, BLOCK_SIZE bsize,
int output_enabled) {
int i, x_idx, y;
VP9_COMMON *const cm = &cpi->common;
- RD_OPT *const rd_opt = &cpi->rd;
- MACROBLOCK *const x = &cpi->mb;
+ RD_COUNTS *const rdc = &td->rd_counts;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
MODE_INFO *mi = &ctx->mic;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
- MODE_INFO *mi_addr = &xd->mi[0];
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MODE_INFO *mi_addr = xd->mi[0];
const struct segmentation *const seg = &cm->seg;
+ const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+ const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+ const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+ MV_REF *const frame_mvs =
+ cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
const int mis = cm->mi_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
@@ -625,10 +977,9 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
assert(mi->mbmi.sb_type == bsize);
*mi_addr = *mi;
- mi_addr->src_mi = mi_addr;
// If segmentation in use
- if (seg->enabled && output_enabled) {
+ if (seg->enabled) {
// For in frame complexity AQ copy the segment id from the segment map.
if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
const uint8_t *const map = seg->update_map ? cpi->segmentation_map
@@ -639,8 +990,9 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
// Else for cyclic refresh mode update the segment map, set the segment id
// and then update the quantizer.
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
- vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0].src_mi->mbmi,
- mi_row, mi_col, bsize, 1);
+ vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row,
+ mi_col, bsize, ctx->rate, ctx->dist,
+ x->skip);
}
}
@@ -665,7 +1017,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
for (x_idx = 0; x_idx < mi_width; x_idx++)
if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
&& (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
- xd->mi[x_idx + y * mis].src_mi = mi_addr;
+ xd->mi[x_idx + y * mis] = mi_addr;
}
if (cpi->oxcf.aq_mode)
@@ -685,15 +1037,15 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
}
x->skip = ctx->skip;
- vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
- sizeof(uint8_t) * ctx->num_4x4_blk);
+ memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
+ sizeof(uint8_t) * ctx->num_4x4_blk);
if (!output_enabled)
return;
if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
for (i = 0; i < TX_MODES; i++)
- rd_opt->tx_select_diff[i] += ctx->tx_rd_diff[i];
+ rdc->tx_select_diff[i] += ctx->tx_rd_diff[i];
}
#if CONFIG_INTERNAL_STATS
@@ -718,20 +1070,31 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
#endif
if (!frame_is_intra_only(cm)) {
if (is_inter_block(mbmi)) {
- vp9_update_mv_count(cm, xd);
+ vp9_update_mv_count(td);
if (cm->interp_filter == SWITCHABLE) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
- ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+ ++td->counts->switchable_interp[ctx][mbmi->interp_filter];
}
}
- rd_opt->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
- rd_opt->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
- rd_opt->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- rd_opt->filter_diff[i] += ctx->best_filter_diff[i];
+ rdc->filter_diff[i] += ctx->best_filter_diff[i];
+ }
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
}
}
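// [Editorial example, not part of the patch] The new frame_mvs bookkeeping
// above stores one MV_REF per 8x8 mi unit. A BLOCK_32X32 therefore spans
// bw = bh = 4 units per side, and x_mis/y_mis clip that footprint at the
// frame edge: for a block whose mi_col lies 2 units from cm->mi_cols,
// x_mis = MIN(4, 2) = 2, so the loop writes only the in-frame y_mis-by-2
// patch of MV_REFs.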
@@ -750,16 +1113,16 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
x->e_mbd.plane[i].subsampling_y);
}
-static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,
- int64_t *dist, BLOCK_SIZE bsize) {
+static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+ RD_COST *rd_cost, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
INTERP_FILTER filter_ref;
if (xd->up_available)
- filter_ref = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter;
+ filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
else if (xd->left_available)
- filter_ref = xd->mi[-1].src_mi->mbmi.interp_filter;
+ filter_ref = xd->mi[-1]->mbmi.interp_filter;
else
filter_ref = EIGHTTAP;
@@ -774,35 +1137,46 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,
mbmi->mv[0].as_int = 0;
mbmi->interp_filter = filter_ref;
- xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = 0;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = 0;
x->skip = 1;
- *rate = 0;
- *dist = 0;
+ vp9_rd_cost_init(rd_cost);
}
-static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+static int set_segment_rdmult(VP9_COMP *const cpi,
+ MACROBLOCK *const x,
+ int8_t segment_id) {
+ int segment_qindex;
+ VP9_COMMON *const cm = &cpi->common;
+ vp9_init_plane_quantizers(cpi, x);
+ vp9_clear_system_state();
+ segment_qindex = vp9_get_qindex(&cm->seg, segment_id,
+ cm->base_qindex);
+ return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
+}
+
+static void rd_pick_sb_modes(VP9_COMP *cpi,
+ TileDataEnc *tile_data,
+ MACROBLOCK *const x,
int mi_row, int mi_col, RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
int i, orig_rdmult;
- double rdmult_ratio;
vp9_clear_system_state();
- rdmult_ratio = 1.0; // avoid uninitialized warnings
// Use the lower precision, but faster, 32x32 fdct for mode selection.
x->use_lp32x32fdct = 1;
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
- mbmi = &xd->mi[0].src_mi->mbmi;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ mbmi = &xd->mi[0]->mbmi;
mbmi->sb_type = bsize;
for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -813,6 +1187,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
}
ctx->is_coded = 0;
ctx->skippable = 0;
+ ctx->pred_pixel_ready = 0;
x->skip_recode = 0;
// Set to zero to make sure we do not use the previous encoded frame stats
@@ -821,13 +1196,15 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
x->source_variance =
- high_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize, xd->bd);
+ vp9_high_get_sby_perpixel_variance(cpi, &x->plane[0].src,
+ bsize, xd->bd);
} else {
x->source_variance =
- get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
}
#else
- x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ x->source_variance =
+ vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
#endif // CONFIG_VP9_HIGHBITDEPTH
// Save rdmult before it might be changed, so it can be restored later.
@@ -845,23 +1222,15 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
: cm->last_frame_seg_map;
mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
}
-
- rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
- vp9_init_plane_quantizers(cpi, x);
- vp9_clear_system_state();
- x->rdmult = (int)round(x->rdmult * rdmult_ratio);
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == COMPLEXITY_AQ) {
- const int mi_offset = mi_row * cm->mi_cols + mi_col;
- unsigned char complexity = cpi->complexity_map[mi_offset];
- const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) ||
- (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2));
- if (!is_edge && (complexity > 128))
- x->rdmult += ((x->rdmult * (complexity - 128)) / 256);
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == CYCLIC_REFRESH_AQ) {
const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
- // If segment 1, use rdmult for that segment.
- if (vp9_get_segment_id(cm, map, bsize, mi_row, mi_col))
+ // If segment is boosted, use rdmult for that segment.
+ if (cyclic_refresh_segment_id_boosted(
+ vp9_get_segment_id(cm, map, bsize, mi_row, mi_col)))
x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
}
@@ -872,21 +1241,25 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
} else {
if (bsize >= BLOCK_8X8) {
if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
- vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, rd_cost, bsize,
+ vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
ctx, best_rd);
else
- vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
+ vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col,
rd_cost, bsize, ctx, best_rd);
} else {
- vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, rd_cost,
- bsize, ctx, best_rd);
+ vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx, best_rd);
}
}
- if (aq_mode == VARIANCE_AQ && rd_cost->rate != INT_MAX) {
- vp9_clear_system_state();
- rd_cost->rate = (int)round(rd_cost->rate * rdmult_ratio);
- rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+
+ // Examine the resulting rate and, for AQ mode 2, make a segment choice.
+ if ((rd_cost->rate != INT_MAX) &&
+ (aq_mode == COMPLEXITY_AQ) && (bsize >= BLOCK_16X16) &&
+ (cm->frame_type == KEY_FRAME ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+ vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
}
x->rdmult = orig_rdmult;
@@ -895,28 +1268,30 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
// refactored to provide proper exit/return handle.
if (rd_cost->rate == INT_MAX)
rd_cost->rdcost = INT64_MAX;
+
+ ctx->rate = rd_cost->rate;
+ ctx->dist = rd_cost->dist;
}
-static void update_stats(VP9_COMMON *cm, const MACROBLOCK *x) {
+static void update_stats(VP9_COMMON *cm, ThreadData *td) {
+ const MACROBLOCK *x = &td->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
- const MODE_INFO *const mi = xd->mi[0].src_mi;
+ const MODE_INFO *const mi = xd->mi[0];
const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
if (!frame_is_intra_only(cm)) {
+ FRAME_COUNTS *const counts = td->counts;
+ const int inter_block = is_inter_block(mbmi);
const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_REF_FRAME);
if (!seg_ref_active) {
- FRAME_COUNTS *const counts = &cm->counts;
- const int inter_block = is_inter_block(mbmi);
-
counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
-
// If the segment reference feature is enabled we have only a single
// reference frame allowed for the segment so exclude it from
// the reference frame counts used to work out probabilities.
if (inter_block) {
const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
-
if (cm->reference_mode == REFERENCE_MODE_SELECT)
counts->comp_inter[vp9_get_reference_mode_context(cm, xd)]
[has_second_ref(mbmi)]++;
@@ -933,15 +1308,33 @@ static void update_stats(VP9_COMMON *cm, const MACROBLOCK *x) {
}
}
}
+ if (inter_block &&
+ !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
+ if (bsize >= BLOCK_8X8) {
+ const PREDICTION_MODE mode = mbmi->mode;
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+ }
+ }
+ }
+ }
}
}
-static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
+static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE bsize) {
- MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
int p;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -949,30 +1342,29 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
int mi_width = num_8x8_blocks_wide_lookup[bsize];
int mi_height = num_8x8_blocks_high_lookup[bsize];
for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(
+ memcpy(
xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
a + num_4x4_blocks_wide * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
- vpx_memcpy(
+ memcpy(
xd->left_context[p]
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
l + num_4x4_blocks_high * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- vpx_memcpy(xd->above_seg_context + mi_col, sa,
- sizeof(*xd->above_seg_context) * mi_width);
- vpx_memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
- sizeof(xd->left_seg_context[0]) * mi_height);
+ memcpy(xd->above_seg_context + mi_col, sa,
+ sizeof(*xd->above_seg_context) * mi_width);
+ memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
+ sizeof(xd->left_seg_context[0]) * mi_height);
}
-static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
+static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE bsize) {
- const MACROBLOCK *const x = &cpi->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
int p;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -982,46 +1374,49 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
// buffer the above/left context information of the block in search.
for (p = 0; p < MAX_MB_PLANE; ++p) {
- vpx_memcpy(
+ memcpy(
a + num_4x4_blocks_wide * p,
xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
- vpx_memcpy(
+ memcpy(
l + num_4x4_blocks_high * p,
xd->left_context[p]
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- vpx_memcpy(sa, xd->above_seg_context + mi_col,
- sizeof(*xd->above_seg_context) * mi_width);
- vpx_memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
- sizeof(xd->left_seg_context[0]) * mi_height);
+ memcpy(sa, xd->above_seg_context + mi_col,
+ sizeof(*xd->above_seg_context) * mi_width);
+ memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
+ sizeof(xd->left_seg_context[0]) * mi_height);
}
static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
+ ThreadData *td,
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
- update_state(cpi, ctx, mi_row, mi_col, bsize, output_enabled);
- encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+ MACROBLOCK *const x = &td->mb;
+ set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+ update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
+ encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
if (output_enabled) {
- update_stats(&cpi->common, &cpi->mb);
+ update_stats(&cpi->common, td);
(*tp)->token = EOSB_TOKEN;
(*tp)++;
}
}
-static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
+static void encode_sb(VP9_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize,
PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
@@ -1042,46 +1437,46 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
partition = partition_lookup[bsl][subsize];
if (output_enabled && bsize != BLOCK_4X4)
- cm->counts.partition[ctx][partition]++;
+ td->counts->partition[ctx][partition]++;
switch (partition) {
case PARTITION_NONE:
- encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->none);
break;
case PARTITION_VERT:
- encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->vertical[0]);
if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
- encode_b(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize,
- &pc_tree->vertical[1]);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
+ subsize, &pc_tree->vertical[1]);
}
break;
case PARTITION_HORZ:
- encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->horizontal[0]);
if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
- encode_b(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize,
- &pc_tree->horizontal[1]);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
+ subsize, &pc_tree->horizontal[1]);
}
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_8X8) {
- encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
pc_tree->leaf_split[0]);
} else {
- encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
pc_tree->split[0]);
- encode_sb(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize,
- pc_tree->split[1]);
- encode_sb(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize,
- pc_tree->split[2]);
- encode_sb(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
+ subsize, pc_tree->split[1]);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
+ subsize, pc_tree->split[2]);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
subsize, pc_tree->split[3]);
}
break;
default:
- assert("Invalid partition type.");
+ assert(0 && "Invalid partition type.");
break;
}
@@ -1111,15 +1506,15 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
static void set_partial_b64x64_partition(MODE_INFO *mi, int mis,
int bh_in, int bw_in, int row8x8_remaining, int col8x8_remaining,
- BLOCK_SIZE bsize, MODE_INFO *mi_8x8) {
+ BLOCK_SIZE bsize, MODE_INFO **mi_8x8) {
int bh = bh_in;
int r, c;
for (r = 0; r < MI_BLOCK_SIZE; r += bh) {
int bw = bw_in;
for (c = 0; c < MI_BLOCK_SIZE; c += bw) {
const int index = r * mis + c;
- mi_8x8[index].src_mi = mi + index;
- mi_8x8[index].src_mi->mbmi.sb_type = find_partition_size(bsize,
+ mi_8x8[index] = mi + index;
+ mi_8x8[index]->mbmi.sb_type = find_partition_size(bsize,
row8x8_remaining - r, col8x8_remaining - c, &bh, &bw);
}
}
@@ -1131,7 +1526,7 @@ static void set_partial_b64x64_partition(MODE_INFO *mi, int mis,
// may not be allowed in which case this code attempts to choose the largest
// allowable partition.
static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
- MODE_INFO *mi_8x8, int mi_row, int mi_col,
+ MODE_INFO **mi_8x8, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mi_stride;
@@ -1150,8 +1545,8 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
int index = block_row * mis + block_col;
- mi_8x8[index].src_mi = mi_upper_left + index;
- mi_8x8[index].src_mi->mbmi.sb_type = bsize;
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->mbmi.sb_type = bsize;
}
}
} else {
@@ -1161,79 +1556,6 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
}
}
-static void copy_partitioning(VP9_COMMON *cm, MODE_INFO *mi_8x8,
- MODE_INFO *prev_mi_8x8) {
- const int mis = cm->mi_stride;
- int block_row, block_col;
-
- for (block_row = 0; block_row < 8; ++block_row) {
- for (block_col = 0; block_col < 8; ++block_col) {
- MODE_INFO *const prev_mi =
- prev_mi_8x8[block_row * mis + block_col].src_mi;
- const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
-
- if (prev_mi) {
- const ptrdiff_t offset = prev_mi - cm->prev_mi;
- mi_8x8[block_row * mis + block_col].src_mi = cm->mi + offset;
- mi_8x8[block_row * mis + block_col].src_mi->mbmi.sb_type = sb_type;
- }
- }
- }
-}
-
-static void constrain_copy_partitioning(VP9_COMP *const cpi,
- const TileInfo *const tile,
- MODE_INFO *mi_8x8,
- MODE_INFO *prev_mi_8x8,
- int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- const int mis = cm->mi_stride;
- const int row8x8_remaining = tile->mi_row_end - mi_row;
- const int col8x8_remaining = tile->mi_col_end - mi_col;
- MODE_INFO *const mi_upper_left = cm->mi + mi_row * mis + mi_col;
- const int bh = num_8x8_blocks_high_lookup[bsize];
- const int bw = num_8x8_blocks_wide_lookup[bsize];
- int block_row, block_col;
-
- assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
-
- // If the SB64 is all "in image".
- if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
- (row8x8_remaining >= MI_BLOCK_SIZE)) {
- for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
- for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
- const int index = block_row * mis + block_col;
- MODE_INFO *prev_mi = prev_mi_8x8[index].src_mi;
- const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
- // Use previous partition if block size is not larger than bsize.
- if (prev_mi && sb_type <= bsize) {
- int block_row2, block_col2;
- for (block_row2 = 0; block_row2 < bh; ++block_row2) {
- for (block_col2 = 0; block_col2 < bw; ++block_col2) {
- const int index2 = (block_row + block_row2) * mis +
- block_col + block_col2;
- prev_mi = prev_mi_8x8[index2].src_mi;
- if (prev_mi) {
- const ptrdiff_t offset = prev_mi - cm->prev_mi;
- mi_8x8[index2].src_mi = cm->mi + offset;
- mi_8x8[index2].src_mi->mbmi.sb_type = prev_mi->mbmi.sb_type;
- }
- }
- }
- } else {
- // Otherwise, use fixed partition of size bsize.
- mi_8x8[index].src_mi = mi_upper_left + index;
- mi_8x8[index].src_mi->mbmi.sb_type = bsize;
- }
- }
- }
- } else {
- // Else this is a partial SB64, copy previous partition.
- copy_partitioning(cm, mi_8x8, prev_mi_8x8);
- }
-}
-
const struct {
int row;
int col;
@@ -1250,10 +1572,10 @@ const struct {
static void set_source_var_based_partition(VP9_COMP *cpi,
const TileInfo *const tile,
- MODE_INFO *mi_8x8,
+ MACROBLOCK *const x,
+ MODE_INFO **mi_8x8,
int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
const int mis = cm->mi_stride;
const int row8x8_remaining = tile->mi_row_end - mi_row;
const int col8x8_remaining = tile->mi_col_end - mi_col;
@@ -1274,7 +1596,7 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
int use32x32 = 0;
unsigned int thr = cpi->source_var_thresh;
- vpx_memset(d32, 0, 4 * sizeof(diff));
+ memset(d32, 0, 4 * sizeof(diff));
for (i = 0; i < 4; i++) {
diff *d16[4];
@@ -1288,8 +1610,8 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
d16[j] = cpi->source_diff_var + offset + boffset;
index = b_mi_row * mis + b_mi_col;
- mi_8x8[index].src_mi = mi_upper_left + index;
- mi_8x8[index].src_mi->mbmi.sb_type = BLOCK_16X16;
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->mbmi.sb_type = BLOCK_16X16;
// TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition
// size to further improve quality.
@@ -1310,8 +1632,8 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
d32[i].var = d32[i].sse - (((int64_t)d32[i].sum * d32[i].sum) >> 10);
index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col;
- mi_8x8[index].src_mi = mi_upper_left + index;
- mi_8x8[index].src_mi->mbmi.sb_type = BLOCK_32X32;
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->mbmi.sb_type = BLOCK_32X32;
}
}
@@ -1322,8 +1644,8 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
// Use 64x64 partition
if (is_larger_better) {
- mi_8x8[0].src_mi = mi_upper_left;
- mi_8x8[0].src_mi->mbmi.sb_type = BLOCK_64X64;
+ mi_8x8[0] = mi_upper_left;
+ mi_8x8[0]->mbmi.sb_type = BLOCK_64X64;
}
}
} else { // partial in-image SB64
@@ -1334,67 +1656,21 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
}
}
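
The 32x32 statistics above come from summing four 16x16 sums and SSEs and applying the variance identity N*var = SSE - sum^2/N; a 32x32 block has 1024 samples, hence the >> 10. A small worked sketch under that assumption (this diff struct is a simplified stand-in for the libvpx one):

    #include <stdint.h>

    typedef struct { int64_t sum; int64_t sse; int64_t var; } diff;  /* stand-in */

    /* var (scaled by N) = SSE - sum^2 / N, with N = 1024 = 2^10 for 32x32. */
    static void compute_var32(diff *d32) {
      d32->var = d32->sse - ((d32->sum * d32->sum) >> 10);
    }

For example, 1024 samples of constant value 4 give sum = 4096 and sse = 16384, so var = 16384 - (4096 * 4096 >> 10) = 0, as expected for a flat block.
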
-static int is_background(const VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, int mi_col) {
- // This assumes the input source frames are of the same dimension.
- const int row8x8_remaining = tile->mi_row_end - mi_row;
- const int col8x8_remaining = tile->mi_col_end - mi_col;
- const int x = mi_col * MI_SIZE;
- const int y = mi_row * MI_SIZE;
- const int src_stride = cpi->Source->y_stride;
- const uint8_t *const src = &cpi->Source->y_buffer[y * src_stride + x];
- const int pre_stride = cpi->Last_Source->y_stride;
- const uint8_t *const pre = &cpi->Last_Source->y_buffer[y * pre_stride + x];
- int this_sad = 0;
- int threshold = 0;
-
- if (row8x8_remaining >= MI_BLOCK_SIZE &&
- col8x8_remaining >= MI_BLOCK_SIZE) {
- this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, pre, pre_stride);
- threshold = (1 << 12);
- } else {
- int r, c;
- for (r = 0; r < row8x8_remaining; r += 2)
- for (c = 0; c < col8x8_remaining; c += 2)
- this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride,
- pre, pre_stride);
- threshold = (row8x8_remaining * col8x8_remaining) << 6;
- }
-
- return this_sad < 2 * threshold;
-}
-
-static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO *prev_mi_8x8,
- const int motion_thresh) {
- const int mis = cm->mi_stride;
- int block_row, block_col;
-
- if (cm->prev_mi) {
- for (block_row = 0; block_row < 8; ++block_row) {
- for (block_col = 0; block_col < 8; ++block_col) {
- const MODE_INFO *prev_mi =
- prev_mi_8x8[block_row * mis + block_col].src_mi;
- if (prev_mi) {
- if (abs(prev_mi->mbmi.mv[0].as_mv.row) > motion_thresh ||
- abs(prev_mi->mbmi.mv[0].as_mv.col) > motion_thresh)
- return 1;
- }
- }
- }
- }
- return 0;
-}
-
-static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *ctx,
int mi_row, int mi_col, int bsize) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MODE_INFO *const mi = xd->mi[0];
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const struct segmentation *const seg = &cm->seg;
+ const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+ const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+ const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = MIN(bh, cm->mi_rows - mi_row);
- *(xd->mi[0].src_mi) = ctx->mic;
- xd->mi[0].src_mi = &xd->mi[0];
+ *(xd->mi[0]) = ctx->mic;
if (seg->enabled && cpi->oxcf.aq_mode) {
// For in frame complexity AQ or variance AQ, copy segment_id from
@@ -1405,18 +1681,40 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
: cm->last_frame_seg_map;
mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
} else {
- // Setting segmentation map for cyclic_refresh
- vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize, 1);
+ // Setting segmentation map for cyclic_refresh.
+ vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize,
+ ctx->rate, ctx->dist, x->skip);
}
vp9_init_plane_quantizers(cpi, x);
}
if (is_inter_block(mbmi)) {
- vp9_update_mv_count(cm, xd);
-
+ vp9_update_mv_count(td);
if (cm->interp_filter == SWITCHABLE) {
const int pred_ctx = vp9_get_pred_context_switchable_interp(xd);
- ++cm->counts.switchable_interp[pred_ctx][mbmi->interp_filter];
+ ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
+ }
+
+ if (mbmi->sb_type < BLOCK_8X8) {
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ }
+ }
+
+ if (cm->use_prev_frame_mvs) {
+ MV_REF *const frame_mvs =
+ cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
}
}
@@ -1424,33 +1722,37 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
x->skip_txfm[0] = mbmi->segment_id ? 0 : ctx->skip_txfm[0];
}
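
The new use_prev_frame_mvs block stores this block's reference frames and motion vectors into cm->cur_frame->mvs, a row-major grid with one MV_REF per 8x8 mi unit, clamping the written window at the right and bottom frame edges. A reduced sketch of that clamped copy (stub types, not the libvpx structs):

    #include <stdint.h>

    typedef struct { int32_t as_int; } int_mv;
    typedef struct { int8_t ref_frame[2]; int_mv mv[2]; } MV_REF;  /* stand-in */

    static int imin(int a, int b) { return a < b ? a : b; }

    /* Write one block's MV_REF into an x_mis-by-y_mis window of the
     * frame-wide grid; bw/bh are the block size in 8x8 mi units. */
    static void save_frame_mvs(MV_REF *grid, int mi_cols, int mi_rows,
                               int mi_row, int mi_col, int bw, int bh,
                               MV_REF cur) {
      const int x_mis = imin(bw, mi_cols - mi_col);
      const int y_mis = imin(bh, mi_rows - mi_row);
      int w, h;
      MV_REF *row = grid + mi_row * mi_cols + mi_col;
      for (h = 0; h < y_mis; ++h, row += mi_cols)
        for (w = 0; w < x_mis; ++w)
          row[w] = cur;  /* copies ref_frame[] and mv[] together */
    }
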
-static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,
+static void encode_b_rt(VP9_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row, int mi_col,
- int output_enabled, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
- update_state_rt(cpi, ctx, mi_row, mi_col, bsize);
+ int output_enabled, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCK *const x = &td->mb;
+ set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+ update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize);
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && output_enabled) {
- vp9_denoiser_denoise(&cpi->denoiser, &cpi->mb, mi_row, mi_col,
+ if (cpi->oxcf.noise_sensitivity > 0 && output_enabled &&
+ cpi->common.frame_type != KEY_FRAME) {
+ vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
MAX(BLOCK_8X8, bsize), ctx);
}
#endif
- encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx);
- update_stats(&cpi->common, &cpi->mb);
+ encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+ update_stats(&cpi->common, td);
(*tp)->token = EOSB_TOKEN;
(*tp)++;
}
-static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
+static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize,
PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
@@ -1463,9 +1765,9 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
if (bsize >= BLOCK_8X8) {
const int idx_str = xd->mi_stride * mi_row + mi_col;
- MODE_INFO *mi_8x8 = cm->mi[idx_str].src_mi;
+ MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str;
ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
- subsize = mi_8x8[0].src_mi->mbmi.sb_type;
+ subsize = mi_8x8[0]->mbmi.sb_type;
} else {
ctx = 0;
subsize = BLOCK_4X4;
@@ -1473,42 +1775,42 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
partition = partition_lookup[bsl][subsize];
if (output_enabled && bsize != BLOCK_4X4)
- cm->counts.partition[ctx][partition]++;
+ td->counts->partition[ctx][partition]++;
switch (partition) {
case PARTITION_NONE:
- encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->none);
break;
case PARTITION_VERT:
- encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->vertical[0]);
if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
- encode_b_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
subsize, &pc_tree->vertical[1]);
}
break;
case PARTITION_HORZ:
- encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->horizontal[0]);
if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
- encode_b_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+ encode_b_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
subsize, &pc_tree->horizontal[1]);
}
break;
case PARTITION_SPLIT:
subsize = get_subsize(bsize, PARTITION_SPLIT);
- encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
pc_tree->split[0]);
- encode_sb_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+ encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
subsize, pc_tree->split[1]);
- encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+ encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
subsize, pc_tree->split[2]);
- encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[3]);
+ encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
+ output_enabled, subsize, pc_tree->split[3]);
break;
default:
- assert("Invalid partition type.");
+ assert(0 && "Invalid partition type.");
break;
}
@@ -1516,13 +1818,17 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
}
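
encode_sb_rt, like its RD counterpart, walks the partition tree using hbs, half the block width in 8x8 mi units: (1 << b_width_log2_lookup[bsize]) is the width in 4x4 units, and dividing by 4 halves it in mi units. A sketch of the offset arithmetic, with the split-child numbering used above (child 1 right, child 2 below, child 3 diagonal):

    /* For square bsize, width_in_4x4 = 1 << bwl; dividing by 4 gives
     * half the width in 8x8 mi units (BLOCK_64X64: (1 << 4) / 4 = 4). */
    static int half_block_mi(int bwl) { return (1 << bwl) / 4; }

    /* Origin of split child i relative to the parent block. */
    static void split_child_origin(int mi_row, int mi_col, int hbs, int i,
                                   int *row, int *col) {
      *row = mi_row + (i >> 1) * hbs;  /* children 2, 3: lower half */
      *col = mi_col + (i & 1) * hbs;   /* children 1, 3: right half */
    }
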
-static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
- MODE_INFO *mi_8x8, TOKENEXTRA **tp,
+static void rd_use_partition(VP9_COMP *cpi,
+ ThreadData *td,
+ TileDataEnc *tile_data,
+ MODE_INFO **mi_8x8, TOKENEXTRA **tp,
int mi_row, int mi_col,
- BLOCK_SIZE bsize, int *rate, int64_t *dist,
+ BLOCK_SIZE bsize,
+ int *rate, int64_t *dist,
int do_recon, PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int mis = cm->mi_stride;
const int bsl = b_width_log2_lookup[bsize];
@@ -1536,7 +1842,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
RD_COST last_part_rdc, none_rdc, chosen_rdc;
BLOCK_SIZE sub_subsize = BLOCK_4X4;
int splits_below = 0;
- BLOCK_SIZE bs_type = mi_8x8[0].src_mi->mbmi.sb_type;
+ BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
int do_partition_search = 1;
PICK_MODE_CONTEXT *ctx = &pc_tree->none;
@@ -1554,10 +1860,10 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
subsize = get_subsize(bsize, partition);
pc_tree->partitioning = partition;
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
x->mb_energy = vp9_block_energy(cpi, x, bsize);
}
@@ -1570,7 +1876,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
splits_below = 1;
for (i = 0; i < 4; i++) {
int jj = i >> 1, ii = i & 0x01;
- MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss].src_mi;
+ MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
splits_below = 0;
}
@@ -1583,7 +1889,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
mi_row + (mi_step >> 1) < cm->mi_rows &&
mi_col + (mi_step >> 1) < cm->mi_cols) {
pc_tree->partitioning = PARTITION_NONE;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rdc, bsize,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
ctx, INT64_MAX);
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -1594,19 +1900,19 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
none_rdc.dist);
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- mi_8x8[0].src_mi->mbmi.sb_type = bs_type;
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ mi_8x8[0]->mbmi.sb_type = bs_type;
pc_tree->partitioning = partition;
}
}
switch (partition) {
case PARTITION_NONE:
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
bsize, ctx, INT64_MAX);
break;
case PARTITION_HORZ:
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
subsize, &pc_tree->horizontal[0],
INT64_MAX);
if (last_part_rdc.rate != INT_MAX &&
@@ -1614,9 +1920,10 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
RD_COST tmp_rdc;
PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
vp9_rd_cost_init(&tmp_rdc);
- update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
- rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+ update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ rd_pick_sb_modes(cpi, tile_data, x,
+ mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
subsize, &pc_tree->horizontal[1], INT64_MAX);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
vp9_rd_cost_reset(&last_part_rdc);
@@ -1628,16 +1935,17 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
}
break;
case PARTITION_VERT:
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
subsize, &pc_tree->vertical[0], INT64_MAX);
if (last_part_rdc.rate != INT_MAX &&
bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
RD_COST tmp_rdc;
PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
vp9_rd_cost_init(&tmp_rdc);
- update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+ update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ rd_pick_sb_modes(cpi, tile_data, x,
+ mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
INT64_MAX);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
@@ -1651,7 +1959,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_8X8) {
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
subsize, pc_tree->leaf_split[0], INT64_MAX);
break;
}
@@ -1667,7 +1975,8 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
continue;
vp9_rd_cost_init(&tmp_rdc);
- rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
+ rd_use_partition(cpi, td, tile_data,
+ mi_8x8 + jj * bss * mis + ii * bss, tp,
mi_row + y_idx, mi_col + x_idx, subsize,
&tmp_rdc.rate, &tmp_rdc.dist,
i != 3, pc_tree->split[i]);
@@ -1702,7 +2011,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
chosen_rdc.rate = 0;
chosen_rdc.dist = 0;
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
pc_tree->partitioning = PARTITION_SPLIT;
// Split partition.
@@ -1716,12 +2025,13 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
pc_tree->split[i]->partitioning = PARTITION_NONE;
- rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+ rd_pick_sb_modes(cpi, tile_data, x,
+ mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
split_subsize, &pc_tree->split[i]->none, INT64_MAX);
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
vp9_rd_cost_reset(&chosen_rdc);
@@ -1732,7 +2042,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
chosen_rdc.dist += tmp_rdc.dist;
if (i != 3)
- encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
split_subsize, pc_tree->split[i]);
pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
@@ -1749,7 +2059,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
// If last_part is better set the partitioning to that.
if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
- mi_8x8[0].src_mi->mbmi.sb_type = bsize;
+ mi_8x8[0]->mbmi.sb_type = bsize;
if (bsize >= BLOCK_8X8)
pc_tree->partitioning = partition;
chosen_rdc = last_part_rdc;
@@ -1761,7 +2071,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
chosen_rdc = none_rdc;
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
// We must have chosen a partitioning and encoding or we'll fail later on.
// No other opportunities for success.
@@ -1770,19 +2080,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (do_recon) {
int output_enabled = (bsize == BLOCK_64X64);
-
- // Check the projected output rate for this SB against its target
- // and if necessary apply a Q delta using segmentation to get
- // closer to the target.
- if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
- vp9_select_in_frame_q_segment(cpi, mi_row, mi_col,
- output_enabled, chosen_rdc.rate);
- }
-
- if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
- vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
- chosen_rdc.rate, chosen_rdc.dist);
- encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize,
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
pc_tree);
}
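
rd_use_partition decides between none_rdc, last_part_rdc and chosen_rdc purely on rdcost, the Lagrangian combination of rate and distortion. The libvpx RDCOST macro folds the multiplier into fixed point; a simplified illustrative form (not the literal macro, which also rounds the rate term):

    #include <stdint.h>

    /* cost = lambda * rate + distortion: rdmult is a Q8 fixed-point
     * lambda applied to rate, rddiv a shift applied to distortion. */
    static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
      return (((int64_t)rate * rdmult) >> 8) + (dist << rddiv);
    }

A lower rdcost means a better rate/distortion trade-off at the current lambda, which is why each branch above compares its accumulated cost against the running best before committing a partitioning.
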
@@ -1813,7 +2111,7 @@ static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
//
// The min and max are assumed to have been initialized prior to calling this
// function so repeat calls can accumulate a min and max of more than one sb64.
-static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO *mi_8x8,
+static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
BLOCK_SIZE *min_block_size,
BLOCK_SIZE *max_block_size,
int bs_hist[BLOCK_SIZES]) {
@@ -1825,7 +2123,7 @@ static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO *mi_8x8,
// Check the sb_type for each block that belongs to this region.
for (i = 0; i < sb_height_in_blocks; ++i) {
for (j = 0; j < sb_width_in_blocks; ++j) {
- MODE_INFO *mi = mi_8x8[index+j].src_mi;
+ MODE_INFO *mi = mi_8x8[index+j];
BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
bs_hist[sb_type]++;
*min_block_size = MIN(*min_block_size, sb_type);
@@ -1847,20 +2145,19 @@ static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
// Look at neighboring blocks and set a min and max partition size based on
// what they chose.
static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCKD *const xd,
int mi_row, int mi_col,
BLOCK_SIZE *min_block_size,
BLOCK_SIZE *max_block_size) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MODE_INFO *mi = xd->mi[0].src_mi;
- const int left_in_image = xd->left_available && mi[-1].src_mi;
- const int above_in_image = xd->up_available && mi[-xd->mi_stride].src_mi;
+ MODE_INFO **mi = xd->mi;
+ const int left_in_image = xd->left_available && mi[-1];
+ const int above_in_image = xd->up_available && mi[-xd->mi_stride];
const int row8x8_remaining = tile->mi_row_end - mi_row;
const int col8x8_remaining = tile->mi_col_end - mi_col;
int bh, bw;
BLOCK_SIZE min_size = BLOCK_4X4;
BLOCK_SIZE max_size = BLOCK_64X64;
- int i = 0;
int bs_hist[BLOCK_SIZES] = {0};
// Trap case where we do not have a prediction.
@@ -1873,54 +2170,27 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
// passed in values for min and max as a starting point.
// Find the min and max partition used in previous frame at this location
if (cm->frame_type != KEY_FRAME) {
- MODE_INFO *prev_mi =
- cm->prev_mip + cm->mi_stride + 1 + mi_row * xd->mi_stride + mi_col;
-
+ MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist);
}
// Find the min and max partition sizes used in the left SB64
if (left_in_image) {
- MODE_INFO *left_sb64_mi = mi[-MI_BLOCK_SIZE].src_mi;
+ MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size,
bs_hist);
}
// Find the min and max partition sizes used in the above SB64.
if (above_in_image) {
- MODE_INFO *above_sb64_mi = mi[-xd->mi_stride * MI_BLOCK_SIZE].src_mi;
+ MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size,
bs_hist);
}
- // adjust observed min and max
+ // Adjust observed min and max for "relaxed" auto partition case.
if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
min_size = min_partition_size[min_size];
max_size = max_partition_size[max_size];
- } else if (cpi->sf.auto_min_max_partition_size ==
- CONSTRAIN_NEIGHBORING_MIN_MAX) {
- // adjust the search range based on the histogram of the observed
- // partition sizes from the left, above, and previous co-located blocks
- int sum = 0;
- int first_moment = 0;
- int second_moment = 0;
- int var_unnormalized = 0;
-
- for (i = 0; i < BLOCK_SIZES; i++) {
- sum += bs_hist[i];
- first_moment += bs_hist[i] * i;
- second_moment += bs_hist[i] * i * i;
- }
-
- // if variance is small enough,
- // adjust the range around its mean size, which gives a tighter range
- var_unnormalized = second_moment - first_moment * first_moment / sum;
- if (var_unnormalized <= 4 * sum) {
- int mean = first_moment / sum;
- min_size = min_partition_size[mean];
- max_size = max_partition_size[mean];
- } else {
- min_size = min_partition_size[min_size];
- max_size = max_partition_size[max_size];
- }
}
}
@@ -1928,7 +2198,7 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
max_size = find_partition_size(max_size,
row8x8_remaining, col8x8_remaining,
&bh, &bw);
- min_size = MIN(min_size, max_size);
+ min_size = MIN(cpi->sf.rd_auto_partition_min_limit, MIN(min_size, max_size));
// When use_square_partition_only is true, make sure at least one square
// partition is allowed by selecting the next smaller square size as
@@ -1943,15 +2213,14 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
}
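
The new clamp line above bounds the automatically derived minimum: rd_auto_partition_min_limit is a speed-feature ceiling on min_size, so neighbor statistics can never force the search to start above that limit, and min_size still cannot exceed max_size. A tiny sketch of the combined clamp, assuming the usual ordering of BLOCK_SIZE values from small to large:

    /* min_size <= max_size and min_size <= sf_limit; both constraints
     * are MINs, so the nesting order does not matter. */
    static int clamp_min_partition(int min_size, int max_size, int sf_limit) {
      const int m = min_size < max_size ? min_size : max_size;
      return sf_limit < m ? sf_limit : m;
    }
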
static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCKD *const xd,
int mi_row, int mi_col,
BLOCK_SIZE *min_block_size,
BLOCK_SIZE *max_block_size) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MODE_INFO *mi_8x8 = xd->mi;
- const int left_in_image = xd->left_available && mi_8x8[-1].src_mi;
- const int above_in_image = xd->up_available &&
- mi_8x8[-xd->mi_stride].src_mi;
+ MODE_INFO **mi_8x8 = xd->mi;
+ const int left_in_image = xd->left_available && mi_8x8[-1];
+ const int above_in_image = xd->up_available && mi_8x8[-xd->mi_stride];
int row8x8_remaining = tile->mi_row_end - mi_row;
int col8x8_remaining = tile->mi_col_end - mi_col;
int bh, bw;
@@ -1964,15 +2233,15 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
if (search_range_ctrl &&
(left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
int block;
- MODE_INFO *mi;
+ MODE_INFO **mi;
BLOCK_SIZE sb_type;
// Find the min and max partition sizes used in the left SB64.
if (left_in_image) {
MODE_INFO *cur_mi;
- mi = mi_8x8[-1].src_mi;
+ mi = &mi_8x8[-1];
for (block = 0; block < MI_BLOCK_SIZE; ++block) {
- cur_mi = mi[block * xd->mi_stride].src_mi;
+ cur_mi = mi[block * xd->mi_stride];
sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0;
min_size = MIN(min_size, sb_type);
max_size = MAX(max_size, sb_type);
@@ -1980,9 +2249,9 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
}
// Find the min and max partition sizes used in the above SB64.
if (above_in_image) {
- mi = mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE].src_mi;
+ mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE];
for (block = 0; block < MI_BLOCK_SIZE; ++block) {
- sb_type = mi[block].src_mi ? mi[block].src_mi->mbmi.sb_type : 0;
+ sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0;
min_size = MIN(min_size, sb_type);
max_size = MAX(max_size, sb_type);
}
@@ -2013,9 +2282,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
MODE_INFO *mi;
const int idx_str = cm->mi_stride * mi_row + mi_col;
- MODE_INFO *prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi;
-
-
+ MODE_INFO **prev_mi = &cm->prev_mi_grid_visible[idx_str];
BLOCK_SIZE bs, min_size, max_size;
min_size = BLOCK_64X64;
@@ -2024,7 +2291,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
if (prev_mi) {
for (idy = 0; idy < mi_height; ++idy) {
for (idx = 0; idx < mi_width; ++idx) {
- mi = prev_mi[idy * cm->mi_stride + idx].src_mi;
+ mi = prev_mi[idy * cm->mi_stride + idx];
bs = mi ? mi->mbmi.sb_type : bsize;
min_size = MIN(min_size, bs);
max_size = MAX(max_size, bs);
@@ -2034,7 +2301,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
if (xd->left_available) {
for (idy = 0; idy < mi_height; ++idy) {
- mi = xd->mi[idy * cm->mi_stride - 1].src_mi;
+ mi = xd->mi[idy * cm->mi_stride - 1];
bs = mi ? mi->mbmi.sb_type : bsize;
min_size = MIN(min_size, bs);
max_size = MAX(max_size, bs);
@@ -2043,7 +2310,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
if (xd->up_available) {
for (idx = 0; idx < mi_width; ++idx) {
- mi = xd->mi[idx - cm->mi_stride].src_mi;
+ mi = xd->mi[idx - cm->mi_stride];
bs = mi ? mi->mbmi.sb_type : bsize;
min_size = MIN(min_size, bs);
max_size = MAX(max_size, bs);
@@ -2060,11 +2327,11 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
}
static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
- vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
+ memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
}
static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
- vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
+ memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
}
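
store_pred_mv and load_pred_mv checkpoint the running motion-vector predictors around each partition trial so one candidate's search cannot leak into the next; sizeof(x->pred_mv) works because pred_mv is a true array member, not a pointer. The generic shape of that by-value checkpoint, with hypothetical stand-in types:

    #include <stdint.h>
    #include <string.h>

    typedef struct { int16_t row, col; } MV;        /* stand-in */
    typedef struct { MV pred_mv[4]; } SearchState;  /* hypothetical holder */

    static void checkpoint_mvs(SearchState *dst, const SearchState *src) {
      /* sizeof on the embedded array copies every element. */
      memcpy(dst->pred_mv, src->pred_mv, sizeof(src->pred_mv));
    }
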
#if CONFIG_FP_MB_STATS
@@ -2115,12 +2382,14 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data,
TOKENEXTRA **tp, int mi_row, int mi_col,
BLOCK_SIZE bsize, RD_COST *rd_cost,
int64_t best_rd, PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
@@ -2139,8 +2408,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
const int xss = x->e_mbd.plane[1].subsampling_x;
const int yss = x->e_mbd.plane[1].subsampling_y;
- BLOCK_SIZE min_size = cpi->sf.min_partition_size;
- BLOCK_SIZE max_size = cpi->sf.max_partition_size;
+ BLOCK_SIZE min_size = x->min_partition_size;
+ BLOCK_SIZE max_size = x->max_partition_size;
#if CONFIG_FP_MB_STATS
unsigned int src_diff_var = UINT_MAX;
@@ -2162,7 +2431,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
vp9_rd_cost_reset(&best_rdc);
best_rdc.rdcost = best_rd;
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
x->mb_energy = vp9_block_energy(cpi, x, bsize);
@@ -2190,12 +2459,12 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
partition_vert_allowed &= force_vert_split;
}
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
- src_diff_var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src,
mi_row, mi_col, bsize);
}
#endif
@@ -2253,8 +2522,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
// PARTITION_NONE
if (partition_none_allowed) {
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rdc, bsize, ctx,
- best_rdc.rdcost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
+ &this_rdc, bsize, ctx, best_rdc.rdcost);
if (this_rdc.rate != INT_MAX) {
if (bsize >= BLOCK_8X8) {
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -2323,9 +2592,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
}
if (skip) {
if (src_diff_var == UINT_MAX) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
src_diff_var = get_sby_perpixel_diff_variance(
- cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize);
+ cpi, &x->plane[0].src, mi_row, mi_col, bsize);
}
if (src_diff_var < 8) {
do_split = 0;
@@ -2336,7 +2605,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
#endif
}
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
}
// store estimated motion vector
@@ -2353,7 +2622,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
pc_tree->leaf_split[0]->pred_interp_filter =
ctx->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
pc_tree->leaf_split[0], best_rdc.rdcost);
if (sum_rdc.rate == INT_MAX)
sum_rdc.rdcost = INT64_MAX;
@@ -2369,7 +2638,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
load_pred_mv(x, ctx);
pc_tree->split[i]->index = i;
- rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
+ rd_pick_partition(cpi, td, tile_data, tp,
+ mi_row + y_idx, mi_col + x_idx,
subsize, &this_rdc,
best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
@@ -2400,7 +2670,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->sf.less_rectangular_check)
do_rect &= !partition_none_allowed;
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
}
// PARTITION_HORZ
@@ -2412,14 +2682,14 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
partition_none_allowed)
pc_tree->horizontal[0].pred_interp_filter =
ctx->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
&pc_tree->horizontal[0], best_rdc.rdcost);
if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
bsize > BLOCK_8X8) {
PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
- update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
+ update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, ctx);
@@ -2427,8 +2697,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
partition_none_allowed)
pc_tree->horizontal[1].pred_interp_filter =
ctx->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile, mi_row + mi_step, mi_col, &this_rdc,
- subsize, &pc_tree->horizontal[1],
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
+ &this_rdc, subsize, &pc_tree->horizontal[1],
best_rdc.rdcost - sum_rdc.rdcost);
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
@@ -2448,7 +2718,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
pc_tree->partitioning = PARTITION_HORZ;
}
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
}
// PARTITION_VERT
if (partition_vert_allowed && do_rect) {
@@ -2460,12 +2730,12 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
partition_none_allowed)
pc_tree->vertical[0].pred_interp_filter =
ctx->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
&pc_tree->vertical[0], best_rdc.rdcost);
if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
bsize > BLOCK_8X8) {
- update_state(cpi, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize,
+ update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
&pc_tree->vertical[0]);
if (cpi->sf.adaptive_motion_search)
@@ -2474,7 +2744,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
partition_none_allowed)
pc_tree->vertical[1].pred_interp_filter =
ctx->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rdc, subsize,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step,
+ &this_rdc, subsize,
&pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
@@ -2495,7 +2766,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
pc_tree->partitioning = PARTITION_VERT;
}
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
}
// TODO(jbb): This code is added so that we avoid static analysis
@@ -2509,18 +2780,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
pc_tree->index != 3) {
int output_enabled = (bsize == BLOCK_64X64);
-
- // Check the projected output rate for this SB against its target
- // and if necessary apply a Q delta using segmentation to get
- // closer to the target.
- if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map)
- vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
- best_rdc.rate);
- if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
- vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
- best_rdc.rate, best_rdc.dist);
-
- encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ bsize, pc_tree);
}
if (bsize == BLOCK_64X64) {
@@ -2532,104 +2793,93 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
}
}
-static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, TOKENEXTRA **tp) {
+static void encode_rd_sb_row(VP9_COMP *cpi,
+ ThreadData *td,
+ TileDataEnc *tile_data,
+ int mi_row,
+ TOKENEXTRA **tp) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
SPEED_FEATURES *const sf = &cpi->sf;
int mi_col;
// Initialize the left context for the new SB row
- vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
- vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+ memset(&xd->left_context, 0, sizeof(xd->left_context));
+ memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
// Code each SB in the row
- for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
+ const struct segmentation *const seg = &cm->seg;
int dummy_rate;
int64_t dummy_dist;
RD_COST dummy_rdc;
int i;
+ int seg_skip = 0;
const int idx_str = cm->mi_stride * mi_row + mi_col;
- MODE_INFO *mi = cm->mi + idx_str;
- MODE_INFO *prev_mi = NULL;
-
- if (cm->frame_type != KEY_FRAME)
- prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi;
+ MODE_INFO **mi = cm->mi_grid_visible + idx_str;
if (sf->adaptive_pred_interp_filter) {
for (i = 0; i < 64; ++i)
- cpi->leaf_tree[i].pred_interp_filter = SWITCHABLE;
+ td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
for (i = 0; i < 64; ++i) {
- cpi->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
- cpi->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
- cpi->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
- cpi->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
- }
- }
-
- vp9_zero(cpi->mb.pred_mv);
- cpi->pc_root->index = 0;
-
- // TODO(yunqingwang): use_lastframe_partitioning is no longer used in good-
- // quality encoding. Need to evaluate it in real-time encoding later to
- // decide if it can be removed too. And then, do the code cleanup.
- cpi->mb.source_variance = UINT_MAX;
- if (sf->partition_search_type == FIXED_PARTITION) {
- set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
- set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col,
- sf->always_this_block_size);
- rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+ td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+ }
+ }
+
+ vp9_zero(x->pred_mv);
+ td->pc_root->index = 0;
+
+ if (seg->enabled) {
+ const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+ : cm->last_frame_seg_map;
+ int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ }
+
+ x->source_variance = UINT_MAX;
+ if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
+ const BLOCK_SIZE bsize =
+ seg_skip ? BLOCK_64X64 : sf->always_this_block_size;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
} else if (cpi->partition_search_skippable_frame) {
BLOCK_SIZE bsize;
- set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
- bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
- set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
- rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
} else if (sf->partition_search_type == VAR_BASED_PARTITION &&
- cm->frame_type != KEY_FRAME ) {
- choose_partitioning(cpi, tile, mi_row, mi_col);
- rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, cpi->pc_root);
- } else if (sf->partition_search_type == SEARCH_PARTITION &&
- sf->use_lastframe_partitioning &&
- (cpi->rc.frames_since_key %
- sf->last_partitioning_redo_frequency) &&
- cm->prev_mi &&
- cm->show_frame &&
- cm->frame_type != KEY_FRAME &&
- !cpi->rc.is_src_frame_alt_ref &&
- ((sf->use_lastframe_partitioning !=
- LAST_FRAME_PARTITION_LOW_MOTION) ||
- !sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) {
- if (sf->constrain_copy_partition &&
- sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))
- constrain_copy_partitioning(cpi, tile, mi, prev_mi,
- mi_row, mi_col, BLOCK_16X16);
- else
- copy_partitioning(cm, mi, prev_mi);
- rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+ cm->frame_type != KEY_FRAME) {
+ choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
} else {
// If required, set upper and lower partition size limits
if (sf->auto_min_max_partition_size) {
- set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
- rd_auto_partition_range(cpi, tile, mi_row, mi_col,
- &sf->min_partition_size,
- &sf->max_partition_size);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
+ &x->min_partition_size,
+ &x->max_partition_size);
}
- rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rdc, INT64_MAX, cpi->pc_root);
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
+ &dummy_rdc, INT64_MAX, td->pc_root);
}
}
}
static void init_encode_frame_mb_context(VP9_COMP *cpi) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
@@ -2641,11 +2891,11 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(xd->above_context[0], 0,
- sizeof(*xd->above_context[0]) *
- 2 * aligned_mi_cols * MAX_MB_PLANE);
- vpx_memset(xd->above_seg_context, 0,
- sizeof(*xd->above_seg_context) * aligned_mi_cols);
+ memset(xd->above_context[0], 0,
+ sizeof(*xd->above_context[0]) *
+ 2 * aligned_mi_cols * MAX_MB_PLANE);
+ memset(xd->above_seg_context, 0,
+ sizeof(*xd->above_seg_context) * aligned_mi_cols);
}
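
The memsets above rely on above_context[0..2] (one per plane) living in a single contiguous allocation, as the in-code note says: the cleared region is 2 entropy contexts per SB64-aligned mi column, per plane. The size arithmetic under that assumption:

    #include <string.h>

    #define MAX_MB_PLANE 3
    typedef char ENTROPY_CONTEXT;  /* matches libvpx's char-sized context */

    /* One memset covers all three planes because they share a buffer:
     * 2 contexts per aligned mi column, times MAX_MB_PLANE planes. */
    static void clear_above_context(ENTROPY_CONTEXT *ctx, int aligned_mi_cols) {
      memset(ctx, 0, sizeof(*ctx) * 2 * aligned_mi_cols * MAX_MB_PLANE);
    }
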
static int check_dual_ref_flags(VP9_COMP *cpi) {
@@ -2662,12 +2912,12 @@ static int check_dual_ref_flags(VP9_COMP *cpi) {
static void reset_skip_tx_size(VP9_COMMON *cm, TX_SIZE max_tx_size) {
int mi_row, mi_col;
const int mis = cm->mi_stride;
- MODE_INFO *mi_ptr = cm->mi;
+ MODE_INFO **mi_ptr = cm->mi_grid_visible;
for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
- if (mi_ptr[mi_col].src_mi->mbmi.tx_size > max_tx_size)
- mi_ptr[mi_col].src_mi->mbmi.tx_size = max_tx_size;
+ if (mi_ptr[mi_col]->mbmi.tx_size > max_tx_size)
+ mi_ptr[mi_col]->mbmi.tx_size = max_tx_size;
}
}
}
@@ -2683,9 +2933,13 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) {
return LAST_FRAME;
}
-static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
- if (cpi->mb.e_mbd.lossless)
+static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
+ if (xd->lossless)
return ONLY_4X4;
+ if (cpi->common.frame_type == KEY_FRAME &&
+ cpi->sf.use_nonrd_pick_mode &&
+ cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+ return ALLOW_16X16;
if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
return ALLOW_32X32;
else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
@@ -2695,37 +2949,59 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
return cpi->common.tx_mode;
}
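
select_tx_mode now checks, in order: lossless forces 4x4-only transforms; the new real-time key-frame case (non-RD mode picking with variance-based partitioning) caps at 16x16; otherwise the tx-size search method chooses between largest-only and full per-block selection. A condensed decision sketch with stand-in flags (the real TX_MODE enum has more members):

    typedef enum { ONLY_4X4, ALLOW_16X16, ALLOW_32X32, TX_MODE_SELECT } TX_MODE;

    static TX_MODE pick_tx_mode(int lossless, int rt_keyframe,
                                int use_largest_all, int use_full_rd,
                                TX_MODE frame_tx_mode) {
      if (lossless) return ONLY_4X4;        /* no transform size choice */
      if (rt_keyframe) return ALLOW_16X16;  /* new fast key-frame path */
      if (use_largest_all) return ALLOW_32X32;
      if (use_full_rd) return TX_MODE_SELECT;
      return frame_tx_mode;                 /* fall back to frame setting */
    }
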
-static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, int mi_col,
- int *rate, int64_t *dist,
+static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x,
+ RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ if (bsize < BLOCK_16X16)
+ vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+ else
+ vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+}
+
+static void nonrd_pick_sb_modes(VP9_COMP *cpi,
+ TileDataEnc *tile_data, MACROBLOCK *const x,
+ int mi_row, int mi_col, RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
- mbmi = &xd->mi[0].src_mi->mbmi;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ mbmi = &xd->mi[0]->mbmi;
mbmi->sb_type = bsize;
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
- if (mbmi->segment_id && x->in_static_area)
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
- if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
- set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize);
+ if (cm->frame_type == KEY_FRAME)
+ hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+ else if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+ set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
+ else if (bsize >= BLOCK_8X8)
+ vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col,
+ rd_cost, bsize, ctx);
else
- vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize, ctx);
+ vp9_pick_inter_mode_sub8x8(cpi, x, tile_data, mi_row, mi_col,
+ rd_cost, bsize, ctx);
duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+
+ if (rd_cost->rate == INT_MAX)
+ vp9_rd_cost_reset(rd_cost);
+
+ ctx->rate = rd_cost->rate;
+ ctx->dist = rd_cost->dist;
}
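
nonrd_pick_sb_modes now normalizes a failed search (rate left at INT_MAX) with vp9_rd_cost_reset before caching rate and dist in the mode context, so downstream consumers never mix sentinel and real values. The two conventions, sketched with a stand-in struct (init = accumulable zero, reset = invalid/worst in every field):

    #include <limits.h>
    #include <stdint.h>

    typedef struct { int rate; int64_t dist; int64_t rdcost; } RD_COST;  /* stand-in */

    static void rd_cost_init(RD_COST *rd) {
      rd->rate = 0; rd->dist = 0; rd->rdcost = 0;  /* safe to += into */
    }
    static void rd_cost_reset(RD_COST *rd) {
      rd->rate = INT_MAX;                          /* unmistakably invalid */
      rd->dist = INT64_MAX;
      rd->rdcost = INT64_MAX;
    }
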
static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
int mi_row, int mi_col,
- BLOCK_SIZE bsize, BLOCK_SIZE subsize,
+ BLOCK_SIZE bsize,
PC_TREE *pc_tree) {
MACROBLOCKD *xd = &x->e_mbd;
int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_subsize(bsize, partition);
assert(bsize >= BLOCK_8X8);
@@ -2734,41 +3010,39 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
switch (partition) {
case PARTITION_NONE:
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
- *(xd->mi[0].src_mi) = pc_tree->none.mic;
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
+ *(xd->mi[0]) = pc_tree->none.mic;
duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
break;
case PARTITION_VERT:
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
- *(xd->mi[0].src_mi) = pc_tree->vertical[0].mic;
- duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
+ *(xd->mi[0]) = pc_tree->vertical[0].mic;
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
if (mi_col + hbs < cm->mi_cols) {
- set_modeinfo_offsets(cm, xd, mi_row, mi_col + hbs);
- *(xd->mi[0].src_mi) = pc_tree->vertical[1].mic;
- duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, bsize);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col + hbs);
+ *(xd->mi[0]) = pc_tree->vertical[1].mic;
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, subsize);
}
break;
case PARTITION_HORZ:
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
- *(xd->mi[0].src_mi) = pc_tree->horizontal[0].mic;
- duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
+ *(xd->mi[0]) = pc_tree->horizontal[0].mic;
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
if (mi_row + hbs < cm->mi_rows) {
- set_modeinfo_offsets(cm, xd, mi_row + hbs, mi_col);
- *(xd->mi[0].src_mi) = pc_tree->horizontal[1].mic;
- duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, bsize);
+ set_mode_info_offsets(cm, xd, mi_row + hbs, mi_col);
+ *(xd->mi[0]) = pc_tree->horizontal[1].mic;
+ duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, subsize);
}
break;
case PARTITION_SPLIT: {
- BLOCK_SIZE subsubsize = get_subsize(subsize, PARTITION_SPLIT);
- fill_mode_info_sb(cm, x, mi_row, mi_col, subsize,
- subsubsize, pc_tree->split[0]);
+ fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]);
fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
- subsubsize, pc_tree->split[1]);
+ pc_tree->split[1]);
fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
- subsubsize, pc_tree->split[2]);
+ pc_tree->split[2]);
fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
- subsubsize, pc_tree->split[3]);
+ pc_tree->split[3]);
break;
}
default:
@@ -2776,24 +3050,39 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
}
}
-static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+// Reset the prediction pixel ready flag recursively.
+static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+ pc_tree->none.pred_pixel_ready = 0;
+ pc_tree->horizontal[0].pred_pixel_ready = 0;
+ pc_tree->horizontal[1].pred_pixel_ready = 0;
+ pc_tree->vertical[0].pred_pixel_ready = 0;
+ pc_tree->vertical[1].pred_pixel_ready = 0;
+
+ if (bsize > BLOCK_8X8) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+ int i;
+ for (i = 0; i < 4; ++i)
+ pred_pixel_ready_reset(pc_tree->split[i], subsize);
+ }
+}
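
pred_pixel_ready is set optimistically on candidate contexts before a rectangular trial (see the hunks below) and must be rolled back across the whole subtree if that trial loses the RD comparison, which is what the recursive reset above does. The generic optimistic-flag/rollback shape, with a stand-in node type:

    typedef struct node { int flag; struct node *child[4]; } node;  /* stand-in */

    static void rollback_flags(node *n) {
      int i;
      if (n == NULL) return;
      n->flag = 0;  /* undo the optimistic marking */
      for (i = 0; i < 4; ++i) rollback_flags(n->child[i]);
    }

Note the libvpx version bounds the recursion by block size rather than by null children, since leaf contexts are embedded directly in the tree.
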
+
+static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data,
TOKENEXTRA **tp, int mi_row,
- int mi_col, BLOCK_SIZE bsize, int *rate,
- int64_t *dist, int do_recon, int64_t best_rd,
+ int mi_col, BLOCK_SIZE bsize, RD_COST *rd_cost,
+ int do_recon, int64_t best_rd,
PC_TREE *pc_tree) {
const SPEED_FEATURES *const sf = &cpi->sf;
- const VP9EncoderConfig *const oxcf = &cpi->oxcf;
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
TOKENEXTRA *tp_orig = *tp;
PICK_MODE_CONTEXT *ctx = &pc_tree->none;
int i;
BLOCK_SIZE subsize = bsize;
- int this_rate, sum_rate = 0, best_rate = INT_MAX;
- int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
- int64_t sum_rd = 0;
+ RD_COST this_rdc, sum_rdc, best_rdc;
int do_split = bsize >= BLOCK_8X8;
int do_rect = 1;
// Override skipping rectangular partition operations for edge blocks
@@ -2812,38 +3101,47 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
assert(num_8x8_blocks_wide_lookup[bsize] ==
num_8x8_blocks_high_lookup[bsize]);
+ vp9_rd_cost_init(&sum_rdc);
+ vp9_rd_cost_reset(&best_rdc);
+ best_rdc.rdcost = best_rd;
+
// Determine which partition types to search according to the speed features.
// The thresholds set here have to be square block sizes.
if (sf->auto_min_max_partition_size) {
- partition_none_allowed &= (bsize <= sf->max_partition_size &&
- bsize >= sf->min_partition_size);
- partition_horz_allowed &= ((bsize <= sf->max_partition_size &&
- bsize > sf->min_partition_size) ||
+ partition_none_allowed &= (bsize <= x->max_partition_size &&
+ bsize >= x->min_partition_size);
+ partition_horz_allowed &= ((bsize <= x->max_partition_size &&
+ bsize > x->min_partition_size) ||
force_horz_split);
- partition_vert_allowed &= ((bsize <= sf->max_partition_size &&
- bsize > sf->min_partition_size) ||
+ partition_vert_allowed &= ((bsize <= x->max_partition_size &&
+ bsize > x->min_partition_size) ||
force_vert_split);
- do_split &= bsize > sf->min_partition_size;
+ do_split &= bsize > x->min_partition_size;
}
if (sf->use_square_partition_only) {
partition_horz_allowed &= force_horz_split;
partition_vert_allowed &= force_vert_split;
}
+ ctx->pred_pixel_ready = !(partition_vert_allowed ||
+ partition_horz_allowed ||
+ do_split);
+
// PARTITION_NONE
if (partition_none_allowed) {
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
- &this_rate, &this_dist, bsize, ctx);
- ctx->mic.mbmi = xd->mi[0].src_mi->mbmi;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
+ &this_rdc, bsize, ctx);
+ ctx->mic.mbmi = xd->mi[0]->mbmi;
ctx->skip_txfm[0] = x->skip_txfm[0];
ctx->skip = x->skip;
- if (this_rate != INT_MAX) {
+ if (this_rdc.rate != INT_MAX) {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rate += cpi->partition_cost[pl][PARTITION_NONE];
- sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
- if (sum_rd < best_rd) {
- int dist_breakout_thr = sf->partition_search_breakout_dist_thr;
+ this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ this_rdc.rate, this_rdc.dist);
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr;
int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr;
dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
@@ -2851,15 +3149,13 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
rate_breakout_thr *= num_pels_log2_lookup[bsize];
- best_rate = this_rate;
- best_dist = this_dist;
- best_rd = sum_rd;
+ best_rdc = this_rdc;
if (bsize >= BLOCK_8X8)
pc_tree->partitioning = PARTITION_NONE;
if (!x->e_mbd.lossless &&
- this_rate < rate_breakout_thr &&
- this_dist < dist_breakout_thr) {
+ this_rdc.rate < rate_breakout_thr &&
+ this_rdc.dist < dist_breakout_thr) {
do_split = 0;
do_rect = 0;
}
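
The breakout above scales both thresholds with block size before comparing: the distortion bound is calibrated at 64x64 (where b_width_log2 + b_height_log2 == 8, so the shift is 0) and halves with each halving of block area, while the rate bound grows with log2 of the pixel count. A sketch of that scaling:

    #include <stdint.h>

    /* dist_thr shrinks with area (one >> per lost log2 step); rate_thr
     * grows with log2(pixels), e.g. num_pels_log2 == 12 at 64x64. */
    static void scale_breakout_thresholds(int64_t *dist_thr, int64_t *rate_thr,
                                          int bwl, int bhl, int num_pels_log2) {
      *dist_thr >>= 8 - (bwl + bhl);
      *rate_thr *= num_pels_log2;
    }

Hitting both scaled bounds for PARTITION_NONE turns off the split and rectangular trials (do_split = do_rect = 0), which is the main speed win of this path.
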
@@ -2871,35 +3167,34 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
store_pred_mv(x, ctx);
// PARTITION_SPLIT
- sum_rd = 0;
if (do_split) {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+ sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
subsize = get_subsize(bsize, PARTITION_SPLIT);
- for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+ for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
const int x_idx = (i & 1) * ms;
const int y_idx = (i >> 1) * ms;
if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
load_pred_mv(x, ctx);
- nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
- subsize, &this_rate, &this_dist, 0,
- best_rd - sum_rd, pc_tree->split[i]);
+ nonrd_pick_partition(cpi, td, tile_data, tp,
+ mi_row + y_idx, mi_col + x_idx,
+ subsize, &this_rdc, 0,
+ best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
- if (this_rate == INT_MAX) {
- sum_rd = INT64_MAX;
+ if (this_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(&sum_rdc);
} else {
- sum_rate += this_rate;
- sum_dist += this_dist;
- sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
}
}
- if (sum_rd < best_rd) {
- best_rate = sum_rate;
- best_dist = sum_dist;
- best_rd = sum_rd;
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
pc_tree->partitioning = PARTITION_SPLIT;
} else {
// skip rectangular partition test when larger block size
@@ -2914,300 +3209,432 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
subsize = get_subsize(bsize, PARTITION_HORZ);
if (sf->adaptive_motion_search)
load_pred_mv(x, ctx);
-
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
- &this_rate, &this_dist, subsize,
+ pc_tree->horizontal[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
&pc_tree->horizontal[0]);
- pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[0].skip = x->skip;
- sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-
- if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
+ if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + ms < cm->mi_rows) {
load_pred_mv(x, ctx);
- nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col,
- &this_rate, &this_dist, subsize,
+ pc_tree->horizontal[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + ms, mi_col,
+ &this_rdc, subsize,
&pc_tree->horizontal[1]);
- pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[1].skip = x->skip;
- if (this_rate == INT_MAX) {
- sum_rd = INT64_MAX;
+ if (this_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(&sum_rdc);
} else {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rate += cpi->partition_cost[pl][PARTITION_HORZ];
- sum_rate += this_rate;
- sum_dist += this_dist;
- sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+ this_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ sum_rdc.rate, sum_rdc.dist);
}
}
- if (sum_rd < best_rd) {
- best_rd = sum_rd;
- best_rate = sum_rate;
- best_dist = sum_dist;
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
pc_tree->partitioning = PARTITION_HORZ;
+ } else {
+ pred_pixel_ready_reset(pc_tree, bsize);
}
}
// PARTITION_VERT
if (partition_vert_allowed && do_rect) {
subsize = get_subsize(bsize, PARTITION_VERT);
-
if (sf->adaptive_motion_search)
load_pred_mv(x, ctx);
-
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
- &this_rate, &this_dist, subsize,
+ pc_tree->vertical[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
&pc_tree->vertical[0]);
- pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
pc_tree->vertical[0].skip = x->skip;
- sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
- if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
+
+ if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + ms < cm->mi_cols) {
load_pred_mv(x, ctx);
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms,
- &this_rate, &this_dist, subsize,
+ pc_tree->vertical[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + ms,
+ &this_rdc, subsize,
&pc_tree->vertical[1]);
- pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->vertical[1].skip = x->skip;
- if (this_rate == INT_MAX) {
- sum_rd = INT64_MAX;
+
+ if (this_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(&sum_rdc);
} else {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rate += cpi->partition_cost[pl][PARTITION_VERT];
- sum_rate += this_rate;
- sum_dist += this_dist;
- sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+ sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ sum_rdc.rate, sum_rdc.dist);
}
}
- if (sum_rd < best_rd) {
- best_rate = sum_rate;
- best_dist = sum_dist;
- best_rd = sum_rd;
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
pc_tree->partitioning = PARTITION_VERT;
+ } else {
+ pred_pixel_ready_reset(pc_tree, bsize);
}
}
- // TODO(JBB): The following line is here just to avoid a static warning
- // that occurs because at this point we never again reuse best_rd
- // despite setting it here. The code should be refactored to avoid this.
- (void) best_rd;
- *rate = best_rate;
- *dist = best_dist;
+ *rd_cost = best_rdc;
- if (best_rate == INT_MAX)
+ if (best_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(rd_cost);
return;
+ }
// update mode info array
- subsize = get_subsize(bsize, pc_tree->partitioning);
- fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, subsize,
- pc_tree);
+ fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, pc_tree);
- if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) {
int output_enabled = (bsize == BLOCK_64X64);
-
-      // Check the projected output rate for this SB against its target
-      // and, if necessary, apply a Q delta using segmentation to get
-      // closer to the target.
- if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
- vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
- best_rate);
- }
-
- if (oxcf->aq_mode == CYCLIC_REFRESH_AQ)
- vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
- best_rate, best_dist);
-
- encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
+ encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ bsize, pc_tree);
}
- if (bsize == BLOCK_64X64) {
+ if (bsize == BLOCK_64X64 && do_recon) {
assert(tp_orig < *tp);
- assert(best_rate < INT_MAX);
- assert(best_dist < INT64_MAX);
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
} else {
assert(tp_orig == *tp);
}
}
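
The hunks above fold the loose (rate, dist, rd) triples into the encoder's RD_COST struct, so a candidate partitioning's cost moves through nonrd_pick_partition() as one value and "keep the best" becomes a single struct copy. A minimal sketch of the pattern, with an illustrative RDCOST macro (the real one lives in vp9_rd.h; the exact shift amounts below are assumptions):

    #include <limits.h>
    #include <stdint.h>

    typedef struct RD_COST {
      int rate;        /* bits */
      int64_t dist;    /* distortion */
      int64_t rdcost;  /* combined rate-distortion cost */
    } RD_COST;

    /* Illustrative stand-in for the RDCOST macro in vp9_rd.h. */
    #define RDCOST(rdmult, rddiv, R, D) \
      (((128 + (int64_t)(R) * (rdmult)) >> 8) + ((int64_t)(D) << (rddiv)))

    /* "Worst possible" sentinel, mirroring vp9_rd_cost_reset(). */
    static void rd_cost_reset(RD_COST *rd) {
      rd->rate = INT_MAX;
      rd->dist = INT64_MAX;
      rd->rdcost = INT64_MAX;
    }

    /* The comparison the loops above now do with one struct copy:
     * best_rdc = this_rdc whenever the candidate is cheaper. */
    static void rd_cost_keep_min(RD_COST *best, const RD_COST *cand) {
      if (cand->rdcost < best->rdcost) *best = *cand;
    }
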
+static void nonrd_select_partition(VP9_COMP *cpi,
+ ThreadData *td,
+ TileDataEnc *tile_data,
+ MODE_INFO **mi,
+ TOKENEXTRA **tp,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int output_enabled,
+ RD_COST *rd_cost, PC_TREE *pc_tree) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+ const int mis = cm->mi_stride;
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+ RD_COST this_rdc;
+
+ vp9_rd_cost_reset(&this_rdc);
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+ return;
+
+ subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
+ partition = partition_lookup[bsl][subsize];
+
+ if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
+ subsize >= BLOCK_16X16) {
+ x->max_partition_size = BLOCK_32X32;
+ x->min_partition_size = BLOCK_8X8;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+ rd_cost, 0, INT64_MAX, pc_tree);
+ } else if (bsize == BLOCK_16X16 && partition != PARTITION_NONE) {
+ x->max_partition_size = BLOCK_16X16;
+ x->min_partition_size = BLOCK_8X8;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+ rd_cost, 0, INT64_MAX, pc_tree);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ pc_tree->none.pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ subsize, &pc_tree->none);
+ pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->none.skip = x->skip;
+ break;
+ case PARTITION_VERT:
+ pc_tree->vertical[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ subsize, &pc_tree->vertical[0]);
+ pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->vertical[0].skip = x->skip;
+ if (mi_col + hbs < cm->mi_cols) {
+ pc_tree->vertical[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
+ &this_rdc, subsize, &pc_tree->vertical[1]);
+ pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->vertical[1].skip = x->skip;
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ }
+ break;
+ case PARTITION_HORZ:
+ pc_tree->horizontal[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ subsize, &pc_tree->horizontal[0]);
+ pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->horizontal[0].skip = x->skip;
+ if (mi_row + hbs < cm->mi_rows) {
+ pc_tree->horizontal[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
+ &this_rdc, subsize, &pc_tree->horizontal[1]);
+ pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->horizontal[1].skip = x->skip;
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ }
+ break;
+ case PARTITION_SPLIT:
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ subsize, output_enabled, rd_cost,
+ pc_tree->split[0]);
+ nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp,
+ mi_row, mi_col + hbs, subsize, output_enabled,
+ &this_rdc, pc_tree->split[1]);
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+ mi_row + hbs, mi_col, subsize, output_enabled,
+ &this_rdc, pc_tree->split[2]);
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+ mi_row + hbs, mi_col + hbs, subsize,
+ output_enabled, &this_rdc, pc_tree->split[3]);
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ break;
+ default:
+ assert(0 && "Invalid partition type.");
+ break;
+ }
+ }
+
+ if (bsize == BLOCK_64X64 && output_enabled)
+ encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, 1, bsize, pc_tree);
+}
+
+
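
nonrd_select_partition() above repeats the same guarded accumulation four times: a sub-block's cost is folded into the running total only when neither side holds the INT_MAX/INT64_MAX "invalid" sentinel. The pattern as a helper (hypothetical -- the source writes the checks out longhand):

    #include <limits.h>
    #include <stdint.h>

    typedef struct { int rate; int64_t dist; int64_t rdcost; } RD_COST;

    /* Fold src into acc unless either side carries the invalid sentinel. */
    static void rd_cost_accumulate(RD_COST *acc, const RD_COST *src) {
      if (src->rate != INT_MAX && src->dist != INT64_MAX &&
          acc->rate != INT_MAX && acc->dist != INT64_MAX) {
        acc->rate += src->rate;
        acc->dist += src->dist;
      }
    }
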
static void nonrd_use_partition(VP9_COMP *cpi,
- const TileInfo *const tile,
- MODE_INFO *mi,
+ ThreadData *td,
+ TileDataEnc *tile_data,
+ MODE_INFO **mi,
TOKENEXTRA **tp,
int mi_row, int mi_col,
BLOCK_SIZE bsize, int output_enabled,
- int *totrate, int64_t *totdist,
- PC_TREE *pc_tree) {
+ RD_COST *dummy_cost, PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
const int mis = cm->mi_stride;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
- int rate = INT_MAX;
- int64_t dist = INT64_MAX;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- subsize = (bsize >= BLOCK_8X8) ? mi[0].src_mi->mbmi.sb_type : BLOCK_4X4;
+ subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
partition = partition_lookup[bsl][subsize];
+ if (output_enabled && bsize != BLOCK_4X4) {
+ int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ td->counts->partition[ctx][partition]++;
+ }
+
switch (partition) {
case PARTITION_NONE:
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+ pc_tree->none.pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
subsize, &pc_tree->none);
- pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
pc_tree->none.skip = x->skip;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ subsize, &pc_tree->none);
break;
case PARTITION_VERT:
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+ pc_tree->vertical[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
subsize, &pc_tree->vertical[0]);
- pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
pc_tree->vertical[0].skip = x->skip;
- if (mi_col + hbs < cm->mi_cols) {
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
- &rate, &dist, subsize, &pc_tree->vertical[1]);
- pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ subsize, &pc_tree->vertical[0]);
+ if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+ pc_tree->vertical[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
+ dummy_cost, subsize, &pc_tree->vertical[1]);
+ pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->vertical[1].skip = x->skip;
- if (rate != INT_MAX && dist != INT64_MAX &&
- *totrate != INT_MAX && *totdist != INT64_MAX) {
- *totrate += rate;
- *totdist += dist;
- }
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col + hbs,
+ output_enabled, subsize, &pc_tree->vertical[1]);
}
break;
case PARTITION_HORZ:
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+ pc_tree->horizontal[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
subsize, &pc_tree->horizontal[0]);
- pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[0].skip = x->skip;
- if (mi_row + hbs < cm->mi_rows) {
- nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
- &rate, &dist, subsize, &pc_tree->horizontal[0]);
- pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ subsize, &pc_tree->horizontal[0]);
+
+ if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+ pc_tree->horizontal[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
+ dummy_cost, subsize, &pc_tree->horizontal[1]);
+ pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[1].skip = x->skip;
- if (rate != INT_MAX && dist != INT64_MAX &&
- *totrate != INT_MAX && *totdist != INT64_MAX) {
- *totrate += rate;
- *totdist += dist;
- }
+ encode_b_rt(cpi, td, tile_info, tp, mi_row + hbs, mi_col,
+ output_enabled, subsize, &pc_tree->horizontal[1]);
}
break;
case PARTITION_SPLIT:
subsize = get_subsize(bsize, PARTITION_SPLIT);
- nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
- subsize, output_enabled, totrate, totdist,
- pc_tree->split[0]);
- nonrd_use_partition(cpi, tile, mi + hbs, tp,
- mi_row, mi_col + hbs, subsize, output_enabled,
- &rate, &dist, pc_tree->split[1]);
- if (rate != INT_MAX && dist != INT64_MAX &&
- *totrate != INT_MAX && *totdist != INT64_MAX) {
- *totrate += rate;
- *totdist += dist;
- }
- nonrd_use_partition(cpi, tile, mi + hbs * mis, tp,
- mi_row + hbs, mi_col, subsize, output_enabled,
- &rate, &dist, pc_tree->split[2]);
- if (rate != INT_MAX && dist != INT64_MAX &&
- *totrate != INT_MAX && *totdist != INT64_MAX) {
- *totrate += rate;
- *totdist += dist;
- }
- nonrd_use_partition(cpi, tile, mi + hbs * mis + hbs, tp,
- mi_row + hbs, mi_col + hbs, subsize, output_enabled,
- &rate, &dist, pc_tree->split[3]);
- if (rate != INT_MAX && dist != INT64_MAX &&
- *totrate != INT_MAX && *totdist != INT64_MAX) {
- *totrate += rate;
- *totdist += dist;
+ if (bsize == BLOCK_8X8) {
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+ subsize, pc_tree->leaf_split[0]);
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col,
+ output_enabled, subsize, pc_tree->leaf_split[0]);
+ } else {
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ subsize, output_enabled, dummy_cost,
+ pc_tree->split[0]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp,
+ mi_row, mi_col + hbs, subsize, output_enabled,
+ dummy_cost, pc_tree->split[1]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+ mi_row + hbs, mi_col, subsize, output_enabled,
+ dummy_cost, pc_tree->split[2]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+ mi_row + hbs, mi_col + hbs, subsize, output_enabled,
+ dummy_cost, pc_tree->split[3]);
}
break;
default:
- assert("Invalid partition type.");
+ assert(0 && "Invalid partition type.");
break;
}
- if (bsize == BLOCK_64X64 && output_enabled) {
- if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
- vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
- *totrate, *totdist);
- encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize, pc_tree);
- }
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
}
-static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, TOKENEXTRA **tp) {
+static void encode_nonrd_sb_row(VP9_COMP *cpi,
+ ThreadData *td,
+ TileDataEnc *tile_data,
+ int mi_row,
+ TOKENEXTRA **tp) {
SPEED_FEATURES *const sf = &cpi->sf;
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
int mi_col;
// Initialize the left context for the new SB row
- vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
- vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+ memset(&xd->left_context, 0, sizeof(xd->left_context));
+ memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
// Code each SB in the row
- for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
- int dummy_rate = 0;
- int64_t dummy_dist = 0;
+ const struct segmentation *const seg = &cm->seg;
+ RD_COST dummy_rdc;
const int idx_str = cm->mi_stride * mi_row + mi_col;
- MODE_INFO *mi = cm->mi + idx_str;
- BLOCK_SIZE bsize;
- x->in_static_area = 0;
+ MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type;
+ BLOCK_SIZE bsize = BLOCK_64X64;
+ int seg_skip = 0;
x->source_variance = UINT_MAX;
vp9_zero(x->pred_mv);
+ vp9_rd_cost_init(&dummy_rdc);
+ x->color_sensitivity[0] = 0;
+ x->color_sensitivity[1] = 0;
+
+ if (seg->enabled) {
+ const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+ : cm->last_frame_seg_map;
+ int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ if (seg_skip) {
+ partition_search_type = FIXED_PARTITION;
+ }
+ }
// Set the partition type of the 64X64 block
- switch (sf->partition_search_type) {
+ switch (partition_search_type) {
case VAR_BASED_PARTITION:
- choose_partitioning(cpi, tile, mi_row, mi_col);
- nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- 1, &dummy_rate, &dummy_dist, cpi->pc_root);
+ // TODO(jingning, marpan): The mode decision and encoding process
+ // support both intra and inter sub8x8 block coding for RTC mode.
+ // Tune the thresholds accordingly to use sub8x8 block coding for
+ // coding performance improvement.
+ choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
break;
case SOURCE_VAR_BASED_PARTITION:
- set_source_var_based_partition(cpi, tile, mi, mi_row, mi_col);
- nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- 1, &dummy_rate, &dummy_dist, cpi->pc_root);
+ set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col);
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
break;
case FIXED_PARTITION:
- bsize = sf->partition_search_type == FIXED_PARTITION ?
- sf->always_this_block_size :
- get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col);
- set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
- nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- 1, &dummy_rate, &dummy_dist, cpi->pc_root);
+ if (!seg_skip)
+ bsize = sf->always_this_block_size;
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
break;
case REFERENCE_PARTITION:
- if (sf->partition_check ||
- !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) {
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
- auto_partition_range(cpi, tile, mi_row, mi_col,
- &sf->min_partition_size,
- &sf->max_partition_size);
- nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, INT64_MAX,
- cpi->pc_root);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+ xd->mi[0]->mbmi.segment_id) {
+ x->max_partition_size = BLOCK_64X64;
+ x->min_partition_size = BLOCK_8X8;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rdc, 1,
+ INT64_MAX, td->pc_root);
} else {
- choose_partitioning(cpi, tile, mi_row, mi_col);
- nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
- BLOCK_64X64, 1, &dummy_rate, &dummy_dist,
- cpi->pc_root);
+ choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
}
+
break;
default:
assert(0);
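
encode_nonrd_sb_row() now consults the segmentation map before picking a search strategy: when the 64x64 block's segment has SEG_LVL_SKIP active, the partition search is demoted to FIXED_PARTITION and the whole block is coded at BLOCK_64X64. A standalone model of that test (the feature-mask layout and the ordinal of SEG_LVL_SKIP are assumptions patterned on vp9_seg_common.h):

    #include <stdint.h>

    enum { SEG_LVL_SKIP = 3 };  /* assumed ordinal */

    typedef struct {
      int enabled;
      uint32_t feature_mask[8];  /* per-segment bitmask of active features */
    } SegModel;

    /* Mirrors the seg_skip check added above: if SKIP is active for this
     * segment, skip the partition search and use one fixed partition. */
    static int seg_skip_active(const SegModel *seg, int segment_id) {
      return seg->enabled &&
             (seg->feature_mask[segment_id] & (1u << SEG_LVL_SKIP)) != 0;
    }
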
@@ -3230,13 +3657,13 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
const int cutoff = (MIN(cm->width, cm->height) >= 720) ?
(cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) :
(cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100);
- DECLARE_ALIGNED_ARRAY(16, int, hist, VAR_HIST_BINS);
+ DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]);
diff *var16 = cpi->source_diff_var;
int sum = 0;
int i, j;
- vpx_memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0]));
+ memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0]));
for (i = 0; i < cm->mb_rows; i++) {
for (j = 0; j < cm->mb_cols; j++) {
@@ -3315,9 +3742,9 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) {
if (cpi->source_diff_var)
vpx_free(cpi->source_diff_var);
- CHECK_MEM_ERROR(cm, cpi->source_diff_var,
- vpx_calloc(cm->MBs, sizeof(diff)));
- }
+ CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+ vpx_calloc(cm->MBs, sizeof(diff)));
+ }
if (!cpi->frames_till_next_var_check)
cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi);
@@ -3329,13 +3756,13 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) {
}
}
-static int get_skip_encode_frame(const VP9_COMMON *cm) {
+static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) {
unsigned int intra_count = 0, inter_count = 0;
int j;
for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
- intra_count += cm->counts.intra_inter[j][0];
- inter_count += cm->counts.intra_inter[j][1];
+ intra_count += td->counts->intra_inter[j][0];
+ inter_count += td->counts->intra_inter[j][1];
}
return (intra_count << 2) < inter_count &&
@@ -3343,47 +3770,80 @@ static int get_skip_encode_frame(const VP9_COMMON *cm) {
cm->show_frame;
}
-static void encode_tiles(VP9_COMP *cpi) {
- const VP9_COMMON *const cm = &cpi->common;
+void vp9_init_tile_data(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
-
int tile_col, tile_row;
- TileInfo tile[4][1 << 6];
- TOKENEXTRA *tok[4][1 << 6];
- TOKENEXTRA *pre_tok = cpi->tok;
+ TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
int tile_tok = 0;
- for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col);
-
- tok[tile_row][tile_col] = pre_tok + tile_tok;
- pre_tok = tok[tile_row][tile_col];
- tile_tok = allocated_tokens(tile[tile_row][tile_col]);
- }
+ if (cpi->tile_data == NULL) {
+ CHECK_MEM_ERROR(cm, cpi->tile_data,
+ vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ int i, j;
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ for (j = 0; j < MAX_MODES; ++j) {
+ tile_data->thresh_freq_fact[i][j] = 32;
+ tile_data->mode_map[i][j] = j;
+ }
+ }
+ }
}
for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- const TileInfo * const ptile = &tile[tile_row][tile_col];
- TOKENEXTRA * const old_tok = tok[tile_row][tile_col];
- int mi_row;
+ TileInfo *tile_info =
+ &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+ vp9_tile_init(tile_info, cm, tile_row, tile_col);
- for (mi_row = ptile->mi_row_start; mi_row < ptile->mi_row_end;
- mi_row += MI_BLOCK_SIZE) {
- if (cpi->sf.use_nonrd_pick_mode && !frame_is_intra_only(cm))
- encode_nonrd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]);
- else
- encode_rd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]);
- }
- cpi->tok_count[tile_row][tile_col] =
- (unsigned int)(tok[tile_row][tile_col] - old_tok);
- assert(tok[tile_row][tile_col] - old_tok <= allocated_tokens(*ptile));
+ cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
+ pre_tok = cpi->tile_tok[tile_row][tile_col];
+ tile_tok = allocated_tokens(*tile_info);
}
}
}
+void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td,
+ int tile_row, int tile_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ TileDataEnc *this_tile =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo * const tile_info = &this_tile->tile_info;
+ TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
+ int mi_row;
+
+ for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
+ if (cpi->sf.use_nonrd_pick_mode)
+ encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
+ else
+ encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+ }
+ cpi->tok_count[tile_row][tile_col] =
+ (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
+ assert(tok - cpi->tile_tok[tile_row][tile_col] <=
+ allocated_tokens(*tile_info));
+}
+
+static void encode_tiles(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ int tile_col, tile_row;
+
+ vp9_init_tile_data(cpi);
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col)
+ vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+}
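
vp9_init_tile_data() hoists per-tile state out of encode_tiles()'s stack arrays into the heap-allocated cpi->tile_data, addressed row-major so worker threads can later reach any tile independently:

    /* Row-major addressing used for cpi->tile_data above: tile (r, c) in
     * a grid with tile_cols columns lives at one flat index. */
    static int tile_index(int tile_row, int tile_col, int tile_cols) {
      return tile_row * tile_cols + tile_col;
    }

With tile_cols = 1 << cm->log2_tile_cols, &cpi->tile_data[tile_index(r, c, tile_cols)] names one tile's TileDataEnc regardless of which thread asks.
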
+
#if CONFIG_FP_MB_STATS
static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
VP9_COMMON *cm, uint8_t **this_frame_mb_stats) {
@@ -3402,18 +3862,20 @@ static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
static void encode_frame_internal(VP9_COMP *cpi) {
SPEED_FEATURES *const sf = &cpi->sf;
RD_OPT *const rd_opt = &cpi->rd;
- MACROBLOCK *const x = &cpi->mb;
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
- xd->mi = cm->mi;
- xd->mi[0].src_mi = &xd->mi[0];
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
- vp9_zero(cm->counts);
- vp9_zero(cpi->coef_counts);
- vp9_zero(rd_opt->comp_pred_diff);
- vp9_zero(rd_opt->filter_diff);
- vp9_zero(rd_opt->tx_select_diff);
+ vp9_zero(*td->counts);
+ vp9_zero(rdc->coef_counts);
+ vp9_zero(rdc->comp_pred_diff);
+ vp9_zero(rdc->filter_diff);
+ vp9_zero(rdc->tx_select_diff);
vp9_zero(rd_opt->tx_select_threshes);
xd->lossless = cm->base_qindex == 0 &&
@@ -3421,13 +3883,11 @@ static void encode_frame_internal(VP9_COMP *cpi) {
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
- cm->tx_mode = select_tx_mode(cpi);
-
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
- else
x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
+ else
+ x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
vp9_highbd_idct4x4_add;
#else
@@ -3435,18 +3895,25 @@ static void encode_frame_internal(VP9_COMP *cpi) {
#endif // CONFIG_VP9_HIGHBITDEPTH
x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
- if (xd->lossless) {
+ if (xd->lossless)
x->optimize = 0;
- cm->lf.filter_level = 0;
- cpi->zbin_mode_boost_enabled = 0;
- }
+
+ cm->tx_mode = select_tx_mode(cpi, xd);
vp9_frame_init_quantizer(cpi);
vp9_initialize_rd_consts(cpi);
- vp9_initialize_me_consts(cpi, cm->base_qindex);
+ vp9_initialize_me_consts(cpi, x, cm->base_qindex);
init_encode_frame_mb_context(cpi);
- set_prev_mi(cm);
+ cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+ cm->width == cm->last_width &&
+ cm->height == cm->last_height &&
+ !cm->intra_only &&
+ cm->last_show_frame;
+ // Special case: set prev_mi to NULL when the previous mode info
+ // context cannot be used.
+ cm->prev_mi = cm->use_prev_frame_mvs ?
+ cm->prev_mip + cm->mi_stride + 1 : NULL;
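
The replacement for set_prev_mi() spells out when the previous frame's motion vectors are reusable: error resilience off, unchanged frame dimensions, not intra-only, and the previous frame actually shown. As a standalone predicate (a sketch; the source assigns the expression inline):

    /* Restates the cm->use_prev_frame_mvs condition above. */
    static int can_use_prev_frame_mvs(int error_resilient_mode,
                                      int width, int last_width,
                                      int height, int last_height,
                                      int intra_only, int last_show_frame) {
      return !error_resilient_mode &&
             width == last_width && height == last_height &&
             !intra_only && last_show_frame;
    }
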
x->quant_fp = cpi->sf.use_quant_fp;
vp9_zero(x->skip_txfm);
@@ -3456,7 +3923,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
int i;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
- PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none;
+ PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
for (i = 0; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][0];
@@ -3466,6 +3933,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
}
vp9_zero(x->zcoeff_blk);
+ if (cm->frame_type != KEY_FRAME && cpi->rc.frames_since_golden == 0)
+ cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
+
if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION)
source_var_based_partition_search_method(cpi);
}
@@ -3481,13 +3951,18 @@ static void encode_frame_internal(VP9_COMP *cpi) {
}
#endif
- encode_tiles(cpi);
+  // If allowed, encode tiles in parallel, one thread handling one tile.
+ if (MIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+ vp9_encode_tiles_mt(cpi);
+ else
+ encode_tiles(cpi);
vpx_usec_timer_mark(&emr_timer);
cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
}
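
Tile encoding forks into vp9_encode_tiles_mt() only when more than one worker is actually usable; the thread count is capped by the number of tile columns, since one thread owns one tile column. The sizing rule in isolation:

    /* At most one worker per tile column, capped by the configured
     * maximum; multithreading engages only when this exceeds 1. */
    static int num_tile_workers(int max_threads, int log2_tile_cols) {
      const int tile_cols = 1 << log2_tile_cols;
      return max_threads < tile_cols ? max_threads : tile_cols;
    }
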
- sf->skip_encode_frame = sf->skip_encode_sb ? get_skip_encode_frame(cm) : 0;
+ sf->skip_encode_frame = sf->skip_encode_sb ?
+ get_skip_encode_frame(cm, td) : 0;
#if 0
// Keep record of the total distortion this time around for future use
@@ -3514,7 +3989,6 @@ static INTERP_FILTER get_interp_filter(
void vp9_encode_frame(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- RD_OPT *const rd_opt = &cpi->rd;
// In the longer term the encoder should be generalized to match the
// decoder such that we allow compound where one of the 3 buffers has a
@@ -3527,9 +4001,9 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
(cm->ref_frame_sign_bias[ALTREF_FRAME] ==
cm->ref_frame_sign_bias[LAST_FRAME])) {
- cm->allow_comp_inter_inter = 0;
+ cpi->allow_comp_inter_inter = 0;
} else {
- cm->allow_comp_inter_inter = 1;
+ cpi->allow_comp_inter_inter = 1;
cm->comp_fixed_ref = ALTREF_FRAME;
cm->comp_var_ref[0] = LAST_FRAME;
cm->comp_var_ref[1] = GOLDEN_FRAME;
@@ -3538,6 +4012,9 @@ void vp9_encode_frame(VP9_COMP *cpi) {
if (cpi->sf.frame_parameter_update) {
int i;
+ RD_OPT *const rd_opt = &cpi->rd;
+ FRAME_COUNTS *counts = cpi->td.counts;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
// This code does a single RD pass over the whole frame assuming
// either compound, single or hybrid prediction as per whatever has
@@ -3553,7 +4030,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
const int is_alt_ref = frame_type == ALTREF_FRAME;
/* prediction (compound, single or hybrid) mode selection */
- if (is_alt_ref || !cm->allow_comp_inter_inter)
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
cm->reference_mode = SINGLE_REFERENCE;
else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
mode_thrs[COMPOUND_REFERENCE] >
@@ -3572,15 +4049,16 @@ void vp9_encode_frame(VP9_COMP *cpi) {
encode_frame_internal(cpi);
for (i = 0; i < REFERENCE_MODES; ++i)
- mode_thrs[i] = (mode_thrs[i] + rd_opt->comp_pred_diff[i] / cm->MBs) / 2;
+ mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- filter_thrs[i] = (filter_thrs[i] + rd_opt->filter_diff[i] / cm->MBs) / 2;
+ filter_thrs[i] = (filter_thrs[i] + rdc->filter_diff[i] / cm->MBs) / 2;
for (i = 0; i < TX_MODES; ++i) {
- int64_t pd = rd_opt->tx_select_diff[i];
+ int64_t pd = rdc->tx_select_diff[i];
if (i == TX_MODE_SELECT)
- pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZES - 1), 0);
+ pd -= RDCOST(cpi->td.mb.rdmult, cpi->td.mb.rddiv, 2048 * (TX_SIZES - 1),
+ 0);
tx_thrs[i] = (tx_thrs[i] + (int)(pd / cm->MBs)) / 2;
}
@@ -3589,16 +4067,16 @@ void vp9_encode_frame(VP9_COMP *cpi) {
int comp_count_zero = 0;
for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
- single_count_zero += cm->counts.comp_inter[i][0];
- comp_count_zero += cm->counts.comp_inter[i][1];
+ single_count_zero += counts->comp_inter[i][0];
+ comp_count_zero += counts->comp_inter[i][1];
}
if (comp_count_zero == 0) {
cm->reference_mode = SINGLE_REFERENCE;
- vp9_zero(cm->counts.comp_inter);
+ vp9_zero(counts->comp_inter);
} else if (single_count_zero == 0) {
cm->reference_mode = COMPOUND_REFERENCE;
- vp9_zero(cm->counts.comp_inter);
+ vp9_zero(counts->comp_inter);
}
}
@@ -3609,19 +4087,18 @@ void vp9_encode_frame(VP9_COMP *cpi) {
int count32x32 = 0;
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- count4x4 += cm->counts.tx.p32x32[i][TX_4X4];
- count4x4 += cm->counts.tx.p16x16[i][TX_4X4];
- count4x4 += cm->counts.tx.p8x8[i][TX_4X4];
+ count4x4 += counts->tx.p32x32[i][TX_4X4];
+ count4x4 += counts->tx.p16x16[i][TX_4X4];
+ count4x4 += counts->tx.p8x8[i][TX_4X4];
- count8x8_lp += cm->counts.tx.p32x32[i][TX_8X8];
- count8x8_lp += cm->counts.tx.p16x16[i][TX_8X8];
- count8x8_8x8p += cm->counts.tx.p8x8[i][TX_8X8];
+ count8x8_lp += counts->tx.p32x32[i][TX_8X8];
+ count8x8_lp += counts->tx.p16x16[i][TX_8X8];
+ count8x8_8x8p += counts->tx.p8x8[i][TX_8X8];
- count16x16_16x16p += cm->counts.tx.p16x16[i][TX_16X16];
- count16x16_lp += cm->counts.tx.p32x32[i][TX_16X16];
- count32x32 += cm->counts.tx.p32x32[i][TX_32X32];
+ count16x16_16x16p += counts->tx.p16x16[i][TX_16X16];
+ count16x16_lp += counts->tx.p32x32[i][TX_16X16];
+ count32x32 += counts->tx.p32x32[i][TX_32X32];
}
-
if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
count32x32 == 0) {
cm->tx_mode = ALLOW_8X8;
@@ -3662,32 +4139,15 @@ static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
++counts->uv_mode[y_mode][uv_mode];
}
-static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) {
- if (enabled) {
- if (is_inter_block(mbmi)) {
- if (mbmi->mode == ZEROMV) {
- return mbmi->ref_frame[0] != LAST_FRAME ? GF_ZEROMV_ZBIN_BOOST
- : LF_ZEROMV_ZBIN_BOOST;
- } else {
- return mbmi->sb_type < BLOCK_8X8 ? SPLIT_MV_ZBIN_BOOST
- : MV_ZBIN_BOOST;
- }
- } else {
- return INTRA_ZBIN_BOOST;
- }
- } else {
- return 0;
- }
-}
-
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
+ TOKENEXTRA **t, int output_enabled,
int mi_row, int mi_col, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *mi_8x8 = xd->mi;
- MODE_INFO *mi = mi_8x8;
+ MODE_INFO **mi_8x8 = xd->mi;
+ MODE_INFO *mi = mi_8x8[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
@@ -3701,7 +4161,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
cpi->sf.allow_skip_recode;
if (!x->skip_recode && !cpi->sf.use_nonrd_pick_mode)
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
x->skip_optimize = ctx->is_coded;
ctx->is_coded = 1;
@@ -3714,36 +4174,31 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
- // Experimental code. Special case for gf and arf zeromv modes.
- // Increase zbin size to suppress noise
- cpi->zbin_mode_boost = get_zbin_mode_boost(mbmi,
- cpi->zbin_mode_boost_enabled);
- vp9_update_zbin_extra(cpi, x);
-
if (!is_inter_block(mbmi)) {
int plane;
mbmi->skip = 1;
for (plane = 0; plane < MAX_MB_PLANE; ++plane)
vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane);
if (output_enabled)
- sum_intra_stats(&cm->counts, mi);
- vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+ sum_intra_stats(td->counts, mi);
+ vp9_tokenize_sb(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
} else {
int ref;
const int is_compound = has_second_ref(mbmi);
for (ref = 0; ref < 1 + is_compound; ++ref) {
YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
mbmi->ref_frame[ref]);
+ assert(cfg != NULL);
vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
&xd->block_refs[ref]->sf);
}
- if (!cpi->sf.reuse_inter_pred_sby || seg_skip)
+ if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip)
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
- vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+ vp9_tokenize_sb(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
}
if (output_enabled) {
@@ -3751,7 +4206,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
mbmi->sb_type >= BLOCK_8X8 &&
!(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd),
- &cm->counts.tx)[mbmi->tx_size];
+ &td->counts->tx)[mbmi->tx_size];
} else {
int x, y;
TX_SIZE tx_size;
@@ -3766,7 +4221,9 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
for (y = 0; y < mi_height; y++)
for (x = 0; x < mi_width; x++)
if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
- mi_8x8[mis * y + x].src_mi->mbmi.tx_size = tx_size;
+ mi_8x8[mis * y + x]->mbmi.tx_size = tx_size;
}
+ ++td->counts->tx.tx_totals[mbmi->tx_size];
+ ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h
index fd1c9aa6427..1acde0283e9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -12,6 +12,8 @@
#ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_
#define VP9_ENCODER_VP9_ENCODEFRAME_H_
+#include "vpx/vpx_integer.h"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -19,6 +21,7 @@ extern "C" {
struct macroblock;
struct yv12_buffer_config;
struct VP9_COMP;
+struct ThreadData;
// Constants used in SOURCE_VAR_BASED_PARTITION
#define VAR_HIST_MAX_BG_VAR 1000
@@ -33,6 +36,12 @@ void vp9_setup_src_planes(struct macroblock *x,
void vp9_encode_frame(struct VP9_COMP *cpi);
+void vp9_init_tile_data(struct VP9_COMP *cpi);
+void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col);
+
+void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int q);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
index f5faa7c23ac..9a4e61ec882 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
@@ -29,12 +29,6 @@ struct optimize_ctx {
ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
};
-struct encode_b_args {
- MACROBLOCK *x;
- struct optimize_ctx *ctx;
- int8_t *skip;
-};
-
void vp9_subtract_block_c(int rows, int cols,
int16_t *diff, ptrdiff_t diff_stride,
const uint8_t *src, ptrdiff_t src_stride,
@@ -99,7 +93,7 @@ typedef struct vp9_token_state {
int rate;
int error;
int next;
- signed char token;
+ int16_t token;
short qc;
} vp9_token_state;
@@ -134,7 +128,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
MACROBLOCKD *const xd = &mb->e_mbd;
struct macroblock_plane *const p = &mb->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- const int ref = is_inter_block(&xd->mi[0].src_mi->mbmi);
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
vp9_token_state tokens[1025][2];
unsigned best_index[1025][2];
uint8_t token_cache[1024];
@@ -153,10 +147,15 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
int next = eob, sz = 0;
int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
int64_t rd_cost0, rd_cost1;
- int rate0, rate1, error0, error1, t0, t1;
+ int rate0, rate1, error0, error1;
+ int16_t t0, t1;
+ EXTRABIT e0;
int best, band, pt, i, final_eob;
- const TOKENVALUE *dct_value_tokens;
- const int16_t *dct_value_cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
assert((!type && !plane) || (type && plane));
assert(eob <= default_eob);
@@ -173,24 +172,9 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
tokens[eob][0].qc = 0;
tokens[eob][1] = tokens[eob][0];
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->bd == 12) {
- dct_value_tokens = vp9_dct_value_tokens_high12_ptr;
- dct_value_cost = vp9_dct_value_cost_high12_ptr;
- } else if (xd->bd == 10) {
- dct_value_tokens = vp9_dct_value_tokens_high10_ptr;
- dct_value_cost = vp9_dct_value_cost_high10_ptr;
- } else {
- dct_value_tokens = vp9_dct_value_tokens_ptr;
- dct_value_cost = vp9_dct_value_cost_ptr;
- }
-#else
- dct_value_tokens = vp9_dct_value_tokens_ptr;
- dct_value_cost = vp9_dct_value_cost_ptr;
-#endif
for (i = 0; i < eob; i++)
token_cache[scan[i]] =
- vp9_pt_energy_class[dct_value_tokens[qcoeff[scan[i]]].token];
+ vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];
for (i = eob; i-- > 0;) {
int base_bits, d2, dx;
@@ -204,7 +188,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
/* Evaluate the first possibility for this state. */
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
- t0 = (dct_value_tokens + x)->token;
+ vp9_get_token_extra(x, &t0, &e0);
/* Consider both possible successor states. */
if (next < default_eob) {
band = band_translate[i + 1];
@@ -217,7 +201,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
UPDATE_RD_COST();
/* And pick the best. */
best = rd_cost1 < rd_cost0;
- base_bits = dct_value_cost[x];
+ base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
dx = mul * (dqcoeff[rc] - coeff[rc]);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -255,8 +239,10 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
*/
t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+ e0 = 0;
} else {
- t0 = t1 = (dct_value_tokens + x)->token;
+ vp9_get_token_extra(x, &t0, &e0);
+ t1 = t0;
}
if (next < default_eob) {
band = band_translate[i + 1];
@@ -275,7 +261,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
UPDATE_RD_COST();
/* And pick the best. */
best = rd_cost1 < rd_cost0;
- base_bits = dct_value_cost[x];
+ base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
if (shortcut) {
#if CONFIG_VP9_HIGHBITDEPTH
@@ -333,8 +319,8 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
UPDATE_RD_COST();
best = rd_cost1 < rd_cost0;
final_eob = -1;
- vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
- vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
+ memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
+ memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
for (i = next; i < eob; i = next) {
const int x = tokens[i][best].qc;
const int rc = scan[i];
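
optimize_b() previously chose one of three precomputed (token, cost) tables by bit depth; the new code computes each coefficient's token and extra bits on the fly with vp9_get_token()/vp9_get_token_extra() and prices only the category-6 tail through a bit-depth-specific table from vp9_get_high_cost_table(). A schematic of that compute-instead-of-lookup shape (token classes, thresholds, and costs below are invented for the sketch, not VP9's real category boundaries):

    #include <stdint.h>

    typedef int16_t EXTRABIT;  /* modeled on the EXTRABIT type in the diff */

    /* Split a quantized value into a token class plus residual "extra
     * bits", roughly what vp9_get_token_extra() does. */
    static void get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
      const int a = v < 0 ? -v : v;
      *token = (int16_t)(a < 5 ? a : 5);   /* 5 = "large value" class */
      *extra = (EXTRABIT)(a >= 5 ? a - 5 : 0);
    }

    /* Price the token, adding the table-driven cost of any extra bits,
     * roughly what vp9_get_cost() does with cat6_high_cost. */
    static int get_cost(int16_t token, EXTRABIT extra,
                        const int16_t *high_cost_table) {
      static const int16_t base_cost[6] = {0, 256, 384, 512, 640, 768};
      return base_cost[token] + (token == 5 ? high_cost_table[extra] : 0);
    }
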
@@ -397,28 +383,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
p->round_fp, p->quant_fp, p->quant_shift,
qcoeff, dqcoeff, pd->dequant,
- p->zbin_extra, eob, scan_order->scan,
+ eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -433,28 +419,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_fdct16x16(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
- vp9_fdct8x8(src_diff, coeff, diff_stride);
- vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
- scan_order->scan, scan_order->iscan);
+ vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
+ x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -490,19 +476,19 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
break;
case TX_16X16:
vp9_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
- vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_8X8:
vp9_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
- vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
- vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
@@ -522,19 +508,19 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
break;
case TX_16X16:
vp9_fdct16x16_1(src_diff, coeff, diff_stride);
- vp9_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_quantize_dc(coeff, 256, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_8X8:
vp9_fdct8x8_1(src_diff, coeff, diff_stride);
- vp9_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_quantize_dc(coeff, 64, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
- vp9_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_quantize_dc(coeff, 16, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
@@ -567,28 +553,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
p->round, p->quant, p->quant_shift, qcoeff,
- dqcoeff, pd->dequant, p->zbin_extra, eob,
+ dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_16X16:
vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -603,28 +589,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_fdct16x16(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_fdct8x8(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -659,24 +645,34 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
}
if (!x->skip_recode) {
- if (max_txsize_lookup[plane_bsize] == tx_size) {
- if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
- // full forward transform and quantization
- if (x->quant_fp)
- vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
- else
- vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
- } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
- // fast path forward transform and quantization
- vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
- } else {
+ if (x->quant_fp) {
+ // Encoding process for rtc mode
+ if (x->skip_txfm[0] == 1 && plane == 0) {
// skip forward transform
p->eobs[block] = 0;
*a = *l = 0;
return;
+ } else {
+ vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
}
} else {
- vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ if (max_txsize_lookup[plane_bsize] == tx_size) {
+ int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
+ if (x->skip_txfm[txfm_blk_index] == 0) {
+ // full forward transform and quantization
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+          } else if (x->skip_txfm[txfm_blk_index] == 2) {
+ // fast path forward transform and quantization
+ vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+ } else {
+ // skip forward transform
+ p->eobs[block] = 0;
+ *a = *l = 0;
+ return;
+ }
+ } else {
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ }
}
}
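
After the restructuring, encode_block() first branches on x->quant_fp (the RTC fast path, where only the luma skip flag short-circuits) and otherwise keeps the three-way skip_txfm dispatch: 0 runs the full transform and quantization, 2 takes the DC-only fast path, anything else skips the block. The skeleton of that decision (codes read off the diff; the enum names are mine):

    /* skip_txfm codes as used above: 0 = full, 2 = DC-only, else skip. */
    enum { TXFM_FULL = 0, TXFM_SKIP = 1, TXFM_DC_ONLY = 2 };

    static const char *pick_txfm_path(int quant_fp, int luma_skip,
                                      int plane, int skip_txfm_code) {
      if (quant_fp)  /* RTC mode: only the luma skip flag matters */
        return (luma_skip && plane == 0) ? "skip" : "xform_quant_fp";
      if (skip_txfm_code == TXFM_FULL) return "xform_quant";
      if (skip_txfm_code == TXFM_DC_ONLY) return "xform_quant_dc";
      return "skip";
    }
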
@@ -777,7 +773,7 @@ void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct encode_b_args arg = {x, &ctx, &mbmi->skip};
int plane;
@@ -802,12 +798,12 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
}
}
-static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
@@ -845,8 +841,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
p->round, p->quant, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant,
- p->zbin_extra, eob,
+ qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob) {
@@ -867,7 +862,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob) {
@@ -889,7 +884,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob) {
@@ -900,7 +895,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
case TX_4X4:
tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
scan_order = &vp9_scan_orders[TX_4X4][tx_type];
- mode = plane == 0 ? get_y_mode(xd->mi[0].src_mi, block) : mbmi->uv_mode;
+ mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
@@ -915,7 +910,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
@@ -954,7 +949,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
@@ -974,7 +969,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
@@ -994,7 +989,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
@@ -1003,7 +998,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
case TX_4X4:
tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
scan_order = &vp9_scan_orders[TX_4X4][tx_type];
- mode = plane == 0 ? get_y_mode(xd->mi[0].src_mi, block) : mbmi->uv_mode;
+ mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
@@ -1018,7 +1013,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
@@ -1040,18 +1035,10 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
*(args->skip) = 0;
}
-void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- int8_t *skip) {
- struct encode_b_args arg = {x, NULL, skip};
- encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
-}
-
-
void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
const MACROBLOCKD *const xd = &x->e_mbd;
- struct encode_b_args arg = {x, NULL, &xd->mi[0].src_mi->mbmi.skip};
+ struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};
- vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra,
- &arg);
+ vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
+ vp9_encode_block_intra, &arg);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h
index 54d2b375110..97df8a66be7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h
@@ -18,6 +18,11 @@
extern "C" {
#endif
+struct encode_b_args {
+ MACROBLOCK *x;
+ struct optimize_ctx *ctx;
+ int8_t *skip;
+};
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
@@ -29,9 +34,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
-void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- int8_t *skip);
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
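This header change completes the refactor visible in vp9_encodemb.c above: vp9_encode_block_intra now matches the per-block visitor signature expected by vp9_foreach_transformed_block_in_plane, with its state carried through the newly exported struct encode_b_args. A minimal sketch of the callback pattern, with hypothetical names (foreach_block, visit_fn, and my_args are illustrative, not the libvpx API):

    typedef void (*visit_fn)(int plane, int block, void *arg);

    static void foreach_block(int plane, int nblocks, visit_fn visit, void *arg) {
      int block;
      for (block = 0; block < nblocks; ++block)
        visit(plane, block, arg);  /* state travels through the opaque arg */
    }

    struct my_args { int count; };

    static void count_blocks(int plane, int block, void *arg) {
      struct my_args *a = (struct my_args *)arg;
      (void)plane; (void)block;
      ++a->count;  /* the visitor recovers its typed state from arg */
    }

Passing the state through a void * is what lets the same iterator drive both the inter and intra encode paths without a per-caller wrapper such as the deleted vp9_encode_block_intra shim.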
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
index 08983956763..af73fcbdcc3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
@@ -161,10 +161,10 @@ static void write_mv_update(const vp9_tree_index *tree,
update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
}
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w) {
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w,
+ nmv_context_counts *const counts) {
int i, j;
- nmv_context *const mvc = &cm->fc.nmvc;
- nmv_context_counts *const counts = &cm->counts.mv;
+ nmv_context *const mvc = &cm->fc->nmvc;
write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
@@ -241,8 +241,9 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const int_mv mvs[2],
}
}
-void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd) {
- const MODE_INFO *mi = xd->mi[0].src_mi;
+void vp9_update_mv_count(ThreadData *td) {
+ const MACROBLOCKD *xd = &td->mb.e_mbd;
+ const MODE_INFO *mi = xd->mi[0];
const MB_MODE_INFO *const mbmi = &mi->mbmi;
if (mbmi->sb_type < BLOCK_8X8) {
@@ -254,12 +255,12 @@ void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int i = idy * 2 + idx;
if (mi->bmi[i].as_mode == NEWMV)
- inc_mvs(mbmi, mi->bmi[i].as_mv, &cm->counts.mv);
+ inc_mvs(mbmi, mi->bmi[i].as_mv, &td->counts->mv);
}
}
} else {
if (mbmi->mode == NEWMV)
- inc_mvs(mbmi, mbmi->mv, &cm->counts.mv);
+ inc_mvs(mbmi, mbmi->mv, &td->counts->mv);
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
index e67f9e3b075..0ae473749ab 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
@@ -20,7 +20,8 @@ extern "C" {
void vp9_entropy_mv_init();
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w);
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w,
+ nmv_context_counts *const counts);
void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
const nmv_context* mvctx, int usehp);
@@ -28,7 +29,7 @@ void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
const nmv_context* mvctx, int usehp);
-void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd);
+void vp9_update_mv_count(ThreadData *td);
#ifdef __cplusplus
} // extern "C"
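These signature changes are part of the multi-threaded encoder work: MV statistics are now accumulated into a per-thread nmv_context_counts reached through ThreadData, rather than directly into VP9_COMMON, so tile workers do not race on shared counters. A sketch of the general pattern, assuming hypothetical types rather than the libvpx structures:

    #define NUM_JOINTS 4

    typedef struct { unsigned int joints[NUM_JOINTS]; } counts_t;

    /* Each worker owns a counts_t; the owner folds them together
     * after the workers have joined, so no locking is needed on
     * the hot counting path. */
    static void merge_counts(counts_t *dst, const counts_t *src) {
      int i;
      for (i = 0; i < NUM_JOINTS; ++i)
        dst->joints[i] += src->joints[i];
    }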
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
index 1758e3fdb90..a1018adb88f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
@@ -13,6 +13,8 @@
#include <limits.h>
#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx/internal/vpx_psnr.h"
#include "vpx_ports/vpx_timer.h"
@@ -35,22 +37,25 @@
#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_mbgraph.h"
-#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_picklpf.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_resize.h"
#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_skin_detection.h"
#include "vp9/encoder/vp9_speed_features.h"
#if CONFIG_INTERNAL_STATS
#include "vp9/encoder/vp9_ssim.h"
#endif
-#include "vp9/encoder/vp9_temporal_filter.h"
-#include "vp9/encoder/vp9_resize.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
-void vp9_coef_tree_initialize();
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
@@ -60,12 +65,14 @@ void vp9_coef_tree_initialize();
// mv. Choose a very high value for
// now so that HIGH_PRECISION is always
// chosen.
-
// #define OUTPUT_YUV_REC
#ifdef OUTPUT_YUV_DENOISED
FILE *yuv_denoised_file = NULL;
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
#ifdef OUTPUT_YUV_REC
FILE *yuv_rec_file;
#endif
@@ -102,8 +109,111 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
}
}
+// Mark all inactive blocks as active. Other segmentation features may be set,
+// so memset cannot be used; instead, only inactive blocks should be reset.
+void vp9_suppress_active_map(VP9_COMP *cpi) {
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i;
+ if (cpi->active_map.enabled || cpi->active_map.update)
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+ seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+void vp9_apply_active_map(VP9_COMP *cpi) {
+ struct segmentation *const seg = &cpi->common.seg;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ const unsigned char *const active_map = cpi->active_map.map;
+ int i;
+
+ assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+ if (frame_is_intra_only(&cpi->common)) {
+ cpi->active_map.enabled = 0;
+ cpi->active_map.update = 1;
+ }
+
+ if (cpi->active_map.update) {
+ if (cpi->active_map.enabled) {
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+ vp9_enable_segmentation(seg);
+ vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+ // Setting the data to -MAX_LOOP_FILTER will result in the computed loop
+ // filter level being zero regardless of the value of seg->abs_delta.
+ vp9_set_segdata(seg, AM_SEGMENT_ID_INACTIVE,
+ SEG_LVL_ALT_LF, -MAX_LOOP_FILTER);
+ } else {
+ vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+ if (seg->enabled) {
+ seg->update_data = 1;
+ seg->update_map = 1;
+ }
+ }
+ cpi->active_map.update = 0;
+ }
+}
+
+int vp9_set_active_map(VP9_COMP* cpi,
+ unsigned char* new_map_16x16,
+ int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+ unsigned char *const active_map_8x8 = cpi->active_map.map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ cpi->active_map.update = 1;
+ if (new_map_16x16) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ active_map_8x8[r * mi_cols + c] =
+ new_map_16x16[(r >> 1) * cols + (c >> 1)]
+ ? AM_SEGMENT_ID_ACTIVE
+ : AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ cpi->active_map.enabled = 1;
+ } else {
+ cpi->active_map.enabled = 0;
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int vp9_get_active_map(VP9_COMP* cpi,
+ unsigned char* new_map_16x16,
+ int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols &&
+ new_map_16x16) {
+ unsigned char* const seg_map_8x8 = cpi->segmentation_map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+ if (cpi->active_map.enabled) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ // Cyclic refresh segments are considered active despite not having
+ // AM_SEGMENT_ID_ACTIVE.
+ new_map_16x16[(r >> 1) * cols + (c >> 1)] |=
+ seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
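Note the (r >> 1, c >> 1) indexing in both functions: the caller supplies the map at 16x16-macroblock granularity while the encoder stores it on the 8x8 mi grid, so each input entry fans out to a 2x2 group of mi units. From the application side this is driven through the codec control interface, as in examples/set_maps.c; a condensed sketch, assuming codec and cfg were initialized as in that example (error handling omitted):

    vpx_active_map_t map;
    unsigned char *active;  /* map.rows * map.cols entries, 1 = encode, 0 = skip */

    map.rows = (cfg.g_h + 15) / 16;   /* one entry per 16x16 block */
    map.cols = (cfg.g_w + 15) / 16;
    active = build_my_map(map.rows, map.cols);  /* hypothetical helper */
    map.active_map = active;
    vpx_codec_control(&codec, VP8E_SET_ACTIVEMAP, &map);

    map.active_map = NULL;            /* a NULL map re-enables the whole frame */
    vpx_codec_control(&codec, VP8E_SET_ACTIVEMAP, &map);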
void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
- MACROBLOCK *const mb = &cpi->mb;
+ MACROBLOCK *const mb = &cpi->td.mb;
cpi->common.allow_high_precision_mv = allow_high_precision_mv;
if (cpi->common.allow_high_precision_mv) {
mb->mvcost = mb->nmvcost_hp;
@@ -134,23 +244,87 @@ static void setup_frame(VP9_COMP *cpi) {
cpi->refresh_alt_ref_frame = 1;
vp9_zero(cpi->interp_filter_selected);
} else {
- cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ *cm->fc = cm->frame_contexts[cm->frame_context_idx];
vp9_zero(cpi->interp_filter_selected[0]);
}
}
-void vp9_initialize_enc() {
- static int init_done = 0;
+static void vp9_enc_setup_mi(VP9_COMMON *cm) {
+ int i;
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+ // Clear top border row
+ memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
+ // Clear left border column
+ for (i = 1; i < cm->mi_rows + 1; ++i)
+ memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
+
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+
+ memset(cm->mi_grid_base, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) {
+ cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+ if (!cm->mip)
+ return 1;
+ cm->prev_mip = vpx_calloc(mi_size, sizeof(*cm->prev_mip));
+ if (!cm->prev_mip)
+ return 1;
+ cm->mi_alloc_size = mi_size;
+
+ cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+ if (!cm->mi_grid_base)
+ return 1;
+ cm->prev_mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+ if (!cm->prev_mi_grid_base)
+ return 1;
+
+ return 0;
+}
+
+static void vp9_enc_free_mi(VP9_COMMON *cm) {
+ vpx_free(cm->mip);
+ cm->mip = NULL;
+ vpx_free(cm->prev_mip);
+ cm->prev_mip = NULL;
+ vpx_free(cm->mi_grid_base);
+ cm->mi_grid_base = NULL;
+ vpx_free(cm->prev_mi_grid_base);
+ cm->prev_mi_grid_base = NULL;
+}
+
+static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
+ // Current mip will be the prev_mip for the next frame.
+ MODE_INFO **temp_base = cm->prev_mi_grid_base;
+ MODE_INFO *temp = cm->prev_mip;
+ cm->prev_mip = cm->mip;
+ cm->mip = temp;
+
+ // Update the upper left visible macroblock ptrs.
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+
+ cm->prev_mi_grid_base = cm->mi_grid_base;
+ cm->mi_grid_base = temp_base;
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+}
+
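vp9_swap_mi_and_prev_mi makes the just-coded frame's mode-info arrays become prev_mi for the next frame by exchanging the base pointers and then rebasing the visible pointers, which avoids copying the whole mi array every frame. The pattern in isolation, with illustrative types rather than the libvpx ones:

    typedef struct {
      int *cur, *prev;   /* two equally sized buffers */
    } dbuf_t;

    static void dbuf_swap(dbuf_t *b) {
      int *tmp = b->prev;  /* O(1): only pointers move, the data stays put */
      b->prev = b->cur;
      b->cur = tmp;
    }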
+void vp9_initialize_enc(void) {
+ static volatile int init_done = 0;
if (!init_done) {
vp9_rtcd();
+ vpx_dsp_rtcd();
+ vpx_scale_rtcd();
vp9_init_intra_predictors();
- vp9_coef_tree_initialize();
- vp9_tokenize_initialize();
vp9_init_me_luts();
vp9_rc_init_minq_luts();
vp9_entropy_mv_init();
- vp9_entropy_mode_init();
vp9_temporal_filter_init();
init_done = 1;
}
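Marking init_done volatile keeps the flag read from being cached across the RTCD setup, but the guard is still a plain check-then-set and is not a synchronization primitive. If strict thread safety were required, one would reach for pthread_once or an equivalent; a hedged sketch of that stricter variant (an alternative illustration, not what libvpx does here):

    #include <pthread.h>

    static pthread_once_t enc_once = PTHREAD_ONCE_INIT;

    static void enc_init_once(void) {
      /* one-time table setup would go here, e.g. the rtcd dispatchers */
    }

    void initialize_enc_threadsafe(void) {
      pthread_once(&enc_once, enc_init_once);  /* safe under concurrent callers */
    }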
@@ -160,17 +334,15 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
int i;
+ vpx_free(cpi->tile_data);
+ cpi->tile_data = NULL;
+
// Delete segmentation map
vpx_free(cpi->segmentation_map);
cpi->segmentation_map = NULL;
- vpx_free(cm->last_frame_seg_map);
- cm->last_frame_seg_map = NULL;
vpx_free(cpi->coding_context.last_frame_seg_map_copy);
cpi->coding_context.last_frame_seg_map_copy = NULL;
- vpx_free(cpi->complexity_map);
- cpi->complexity_map = NULL;
-
vpx_free(cpi->nmvcosts[0]);
vpx_free(cpi->nmvcosts[1]);
cpi->nmvcosts[0] = NULL;
@@ -194,7 +366,13 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vp9_cyclic_refresh_free(cpi->cyclic_refresh);
cpi->cyclic_refresh = NULL;
- vp9_free_ref_frame_buffers(cm);
+ vpx_free(cpi->active_map.map);
+ cpi->active_map.map = NULL;
+
+ vp9_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+ vp9_free_postproc_buffers(cm);
+#endif
vp9_free_context_buffers(cm);
vp9_free_frame_buffer(&cpi->last_frame_uf);
@@ -203,10 +381,10 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vp9_free_frame_buffer(&cpi->alt_ref_buffer);
vp9_lookahead_destroy(cpi->lookahead);
- vpx_free(cpi->tok);
- cpi->tok = 0;
+ vpx_free(cpi->tile_tok[0][0]);
+ cpi->tile_tok[0][0] = 0;
- vp9_free_pc_tree(cpi);
+ vp9_free_pc_tree(&cpi->td);
for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i];
@@ -223,11 +401,11 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
for (i = 0; i < MAX_LAG_BUFFERS; ++i) {
vp9_free_frame_buffer(&cpi->svc.scaled_frames[i]);
}
- vpx_memset(&cpi->svc.scaled_frames[0], 0,
- MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
+ memset(&cpi->svc.scaled_frames[0], 0,
+ MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
vp9_free_frame_buffer(&cpi->svc.empty_frame.img);
- vpx_memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
+ memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
}
static void save_coding_context(VP9_COMP *cpi) {
@@ -238,26 +416,26 @@ static void save_coding_context(VP9_COMP *cpi) {
// restored with a call to vp9_restore_coding_context. These functions are
// intended for use in a re-code loop in vp9_compress_frame where the
// quantizer value is adjusted between loop iterations.
- vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost);
+ vp9_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost);
- vpx_memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
- MV_VALS * sizeof(*cpi->nmvcosts[0]));
- vpx_memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
- MV_VALS * sizeof(*cpi->nmvcosts[1]));
- vpx_memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
- MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
- vpx_memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
- MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
+ memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
+ MV_VALS * sizeof(*cpi->nmvcosts[0]));
+ memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
+ MV_VALS * sizeof(*cpi->nmvcosts[1]));
+ memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
+ MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
+ memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
+ MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
- vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
- cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
+ memcpy(cpi->coding_context.last_frame_seg_map_copy,
+ cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
- cc->fc = cm->fc;
+ cc->fc = *cm->fc;
}
static void restore_coding_context(VP9_COMP *cpi) {
@@ -266,27 +444,25 @@ static void restore_coding_context(VP9_COMP *cpi) {
// Restore key state variables to the snapshot state stored in the
// previous call to vp9_save_coding_context.
- vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
+ vp9_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
- vpx_memcpy(cpi->nmvcosts[0], cc->nmvcosts[0],
- MV_VALS * sizeof(*cc->nmvcosts[0]));
- vpx_memcpy(cpi->nmvcosts[1], cc->nmvcosts[1],
- MV_VALS * sizeof(*cc->nmvcosts[1]));
- vpx_memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
- MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
- vpx_memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
- MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
+ memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], MV_VALS * sizeof(*cc->nmvcosts[0]));
+ memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], MV_VALS * sizeof(*cc->nmvcosts[1]));
+ memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
+ MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
+ memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
+ MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
- vpx_memcpy(cm->last_frame_seg_map,
- cpi->coding_context.last_frame_seg_map_copy,
- (cm->mi_rows * cm->mi_cols));
+ memcpy(cm->last_frame_seg_map,
+ cpi->coding_context.last_frame_seg_map_copy,
+ (cm->mi_rows * cm->mi_cols));
vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
- cm->fc = cc->fc;
+ *cm->fc = cc->fc;
}
static void configure_static_seg_features(VP9_COMP *cpi) {
@@ -300,7 +476,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
// Disable and clear down for KF
if (cm->frame_type == KEY_FRAME) {
// Clear down the global segmentation map
- vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
seg->update_map = 0;
seg->update_data = 0;
cpi->static_mb_pct = 0;
@@ -313,7 +489,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
} else if (cpi->refresh_alt_ref_frame) {
// If this is an alt ref frame
// Clear down the global segmentation map
- vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
seg->update_map = 0;
seg->update_data = 0;
cpi->static_mb_pct = 0;
@@ -374,7 +550,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
vp9_disable_segmentation(seg);
- vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
seg->update_map = 0;
seg->update_data = 0;
@@ -415,15 +591,15 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
static void update_reference_segmentation_map(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- MODE_INFO *mi_8x8_ptr = cm->mi;
+ MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
uint8_t *cache_ptr = cm->last_frame_seg_map;
int row, col;
for (row = 0; row < cm->mi_rows; row++) {
- MODE_INFO *mi_8x8 = mi_8x8_ptr;
+ MODE_INFO **mi_8x8 = mi_8x8_ptr;
uint8_t *cache = cache_ptr;
for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
- cache[0] = mi_8x8[0].src_mi->mbmi.segment_id;
+ cache[0] = mi_8x8[0]->mbmi.segment_id;
mi_8x8_ptr += cm->mi_stride;
cache_ptr += cm->mi_cols;
}
@@ -433,8 +609,9 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
const VP9EncoderConfig *oxcf = &cpi->oxcf;
- cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
- cm->subsampling_x, cm->subsampling_y,
+ if (!cpi->lookahead)
+ cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
+ cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
@@ -443,24 +620,19 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate lag buffers");
+ // TODO(agrange) Check if ARF is enabled and skip allocation if not.
if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
oxcf->width, oxcf->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate altref buffer");
}
-static void alloc_ref_frame_buffers(VP9_COMP *cpi) {
- VP9_COMMON *const cm = &cpi->common;
- if (vp9_alloc_ref_frame_buffers(cm, cm->width, cm->height))
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate frame buffers");
-}
-
static void alloc_util_frame_buffers(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
@@ -469,7 +641,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate last frame buffer");
@@ -479,7 +652,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate scaled source buffer");
@@ -489,7 +663,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate scaled last source buffer");
}
@@ -499,19 +674,20 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
vp9_alloc_context_buffers(cm, cm->width, cm->height);
- vpx_free(cpi->tok);
+ vpx_free(cpi->tile_tok[0][0]);
{
unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
- CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+ CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+ vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
}
- vp9_setup_pc_tree(&cpi->common, cpi);
+ vp9_setup_pc_tree(&cpi->common, &cpi->td);
}
static void update_frame_size(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
vp9_set_mb_mi(cm, cm->width, cm->height);
vp9_init_context_buffers(cm);
@@ -524,7 +700,8 @@ static void update_frame_size(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
}
@@ -541,9 +718,14 @@ static void set_tile_limits(VP9_COMP *cpi) {
int min_log2_tile_cols, max_log2_tile_cols;
vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
- cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
- min_log2_tile_cols, max_log2_tile_cols);
- cm->log2_tile_rows = cpi->oxcf.tile_rows;
+ if (is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING) {
+ cm->log2_tile_cols = 0;
+ cm->log2_tile_rows = 0;
+ } else {
+ cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
+ min_log2_tile_cols, max_log2_tile_cols);
+ cm->log2_tile_rows = cpi->oxcf.tile_rows;
+ }
}
static void init_buffer_indices(VP9_COMP *cpi) {
@@ -563,12 +745,15 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth = oxcf->use_highbitdepth;
#endif
- cm->color_space = UNKNOWN;
+ cm->color_space = oxcf->color_space;
cm->width = oxcf->width;
cm->height = oxcf->height;
vp9_alloc_compressor_data(cpi);
+ // Single thread case: use counts in common.
+ cpi->td.counts = &cm->counts;
+
// Spatial scalability.
cpi->svc.number_spatial_layers = oxcf->ss_number_layers;
// Temporal scalability.
@@ -577,7 +762,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
- cpi->oxcf.pass == 2)) {
+ cpi->oxcf.pass != 1)) {
vp9_init_layer_context(cpi);
}
@@ -746,61 +931,61 @@ static void fnname##_bits12(const uint8_t *src_ptr, \
sad_array[i] >>= 4; \
}
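The fragment above is the tail of the _bits12 SAD wrapper: SADs computed on high-bit-depth samples come back scaled by the extra precision, so the 12-bit variants shift results right by 4 (and, by the same logic, the _bits10 variants by 2) before handing them to the common motion-search code. A reduced sketch of the single-value wrapper shape, assuming a simplified signature (the real macros also emit the avg/x3/x8/x4d forms):

    #define MAKE_SAD_WRAPPER(fnname)                                          \
      static unsigned int fnname##_bits12(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride) { \
        /* rescale the 12-bit-depth SAD back into the 8-bit range */          \
        return fnname(src, src_stride, ref, ref_stride) >> 4;                 \
      }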
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x16)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x16x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x32)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x32x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x32)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x32x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x64)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x64x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x32)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x32_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad32x32x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad32x32x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x32x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x64)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x64_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad64x64x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad64x64x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x64x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x16)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x16_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x16x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x16x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x16x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x8)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x8_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x8x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x8x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x8x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x16)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x16_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x16x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x16x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x16x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x8)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x8_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x8x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x8x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x8x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x4)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x4_avg)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x4x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x4x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x8)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x8_avg)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x8x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x8x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x4)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x4_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad4x4x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x4x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x4x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
static void highbd_set_var_fns(VP9_COMP *const cpi) {
VP9_COMMON *const cm = &cpi->common;
@@ -808,398 +993,398 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
switch (cm->bit_depth) {
case VPX_BITS_8:
HIGHBD_BFP(BLOCK_32X16,
- vp9_highbd_sad32x16_bits8,
- vp9_highbd_sad32x16_avg_bits8,
+ vpx_highbd_sad32x16_bits8,
+ vpx_highbd_sad32x16_avg_bits8,
vp9_highbd_variance32x16,
vp9_highbd_sub_pixel_variance32x16,
vp9_highbd_sub_pixel_avg_variance32x16,
NULL,
NULL,
- vp9_highbd_sad32x16x4d_bits8)
+ vpx_highbd_sad32x16x4d_bits8)
HIGHBD_BFP(BLOCK_16X32,
- vp9_highbd_sad16x32_bits8,
- vp9_highbd_sad16x32_avg_bits8,
+ vpx_highbd_sad16x32_bits8,
+ vpx_highbd_sad16x32_avg_bits8,
vp9_highbd_variance16x32,
vp9_highbd_sub_pixel_variance16x32,
vp9_highbd_sub_pixel_avg_variance16x32,
NULL,
NULL,
- vp9_highbd_sad16x32x4d_bits8)
+ vpx_highbd_sad16x32x4d_bits8)
HIGHBD_BFP(BLOCK_64X32,
- vp9_highbd_sad64x32_bits8,
- vp9_highbd_sad64x32_avg_bits8,
+ vpx_highbd_sad64x32_bits8,
+ vpx_highbd_sad64x32_avg_bits8,
vp9_highbd_variance64x32,
vp9_highbd_sub_pixel_variance64x32,
vp9_highbd_sub_pixel_avg_variance64x32,
NULL,
NULL,
- vp9_highbd_sad64x32x4d_bits8)
+ vpx_highbd_sad64x32x4d_bits8)
HIGHBD_BFP(BLOCK_32X64,
- vp9_highbd_sad32x64_bits8,
- vp9_highbd_sad32x64_avg_bits8,
+ vpx_highbd_sad32x64_bits8,
+ vpx_highbd_sad32x64_avg_bits8,
vp9_highbd_variance32x64,
vp9_highbd_sub_pixel_variance32x64,
vp9_highbd_sub_pixel_avg_variance32x64,
NULL,
NULL,
- vp9_highbd_sad32x64x4d_bits8)
+ vpx_highbd_sad32x64x4d_bits8)
HIGHBD_BFP(BLOCK_32X32,
- vp9_highbd_sad32x32_bits8,
- vp9_highbd_sad32x32_avg_bits8,
+ vpx_highbd_sad32x32_bits8,
+ vpx_highbd_sad32x32_avg_bits8,
vp9_highbd_variance32x32,
vp9_highbd_sub_pixel_variance32x32,
vp9_highbd_sub_pixel_avg_variance32x32,
- vp9_highbd_sad32x32x3_bits8,
- vp9_highbd_sad32x32x8_bits8,
- vp9_highbd_sad32x32x4d_bits8)
+ vpx_highbd_sad32x32x3_bits8,
+ vpx_highbd_sad32x32x8_bits8,
+ vpx_highbd_sad32x32x4d_bits8)
HIGHBD_BFP(BLOCK_64X64,
- vp9_highbd_sad64x64_bits8,
- vp9_highbd_sad64x64_avg_bits8,
+ vpx_highbd_sad64x64_bits8,
+ vpx_highbd_sad64x64_avg_bits8,
vp9_highbd_variance64x64,
vp9_highbd_sub_pixel_variance64x64,
vp9_highbd_sub_pixel_avg_variance64x64,
- vp9_highbd_sad64x64x3_bits8,
- vp9_highbd_sad64x64x8_bits8,
- vp9_highbd_sad64x64x4d_bits8)
+ vpx_highbd_sad64x64x3_bits8,
+ vpx_highbd_sad64x64x8_bits8,
+ vpx_highbd_sad64x64x4d_bits8)
HIGHBD_BFP(BLOCK_16X16,
- vp9_highbd_sad16x16_bits8,
- vp9_highbd_sad16x16_avg_bits8,
+ vpx_highbd_sad16x16_bits8,
+ vpx_highbd_sad16x16_avg_bits8,
vp9_highbd_variance16x16,
vp9_highbd_sub_pixel_variance16x16,
vp9_highbd_sub_pixel_avg_variance16x16,
- vp9_highbd_sad16x16x3_bits8,
- vp9_highbd_sad16x16x8_bits8,
- vp9_highbd_sad16x16x4d_bits8)
+ vpx_highbd_sad16x16x3_bits8,
+ vpx_highbd_sad16x16x8_bits8,
+ vpx_highbd_sad16x16x4d_bits8)
HIGHBD_BFP(BLOCK_16X8,
- vp9_highbd_sad16x8_bits8,
- vp9_highbd_sad16x8_avg_bits8,
+ vpx_highbd_sad16x8_bits8,
+ vpx_highbd_sad16x8_avg_bits8,
vp9_highbd_variance16x8,
vp9_highbd_sub_pixel_variance16x8,
vp9_highbd_sub_pixel_avg_variance16x8,
- vp9_highbd_sad16x8x3_bits8,
- vp9_highbd_sad16x8x8_bits8,
- vp9_highbd_sad16x8x4d_bits8)
+ vpx_highbd_sad16x8x3_bits8,
+ vpx_highbd_sad16x8x8_bits8,
+ vpx_highbd_sad16x8x4d_bits8)
HIGHBD_BFP(BLOCK_8X16,
- vp9_highbd_sad8x16_bits8,
- vp9_highbd_sad8x16_avg_bits8,
+ vpx_highbd_sad8x16_bits8,
+ vpx_highbd_sad8x16_avg_bits8,
vp9_highbd_variance8x16,
vp9_highbd_sub_pixel_variance8x16,
vp9_highbd_sub_pixel_avg_variance8x16,
- vp9_highbd_sad8x16x3_bits8,
- vp9_highbd_sad8x16x8_bits8,
- vp9_highbd_sad8x16x4d_bits8)
+ vpx_highbd_sad8x16x3_bits8,
+ vpx_highbd_sad8x16x8_bits8,
+ vpx_highbd_sad8x16x4d_bits8)
HIGHBD_BFP(BLOCK_8X8,
- vp9_highbd_sad8x8_bits8,
- vp9_highbd_sad8x8_avg_bits8,
+ vpx_highbd_sad8x8_bits8,
+ vpx_highbd_sad8x8_avg_bits8,
vp9_highbd_variance8x8,
vp9_highbd_sub_pixel_variance8x8,
vp9_highbd_sub_pixel_avg_variance8x8,
- vp9_highbd_sad8x8x3_bits8,
- vp9_highbd_sad8x8x8_bits8,
- vp9_highbd_sad8x8x4d_bits8)
+ vpx_highbd_sad8x8x3_bits8,
+ vpx_highbd_sad8x8x8_bits8,
+ vpx_highbd_sad8x8x4d_bits8)
HIGHBD_BFP(BLOCK_8X4,
- vp9_highbd_sad8x4_bits8,
- vp9_highbd_sad8x4_avg_bits8,
+ vpx_highbd_sad8x4_bits8,
+ vpx_highbd_sad8x4_avg_bits8,
vp9_highbd_variance8x4,
vp9_highbd_sub_pixel_variance8x4,
vp9_highbd_sub_pixel_avg_variance8x4,
NULL,
- vp9_highbd_sad8x4x8_bits8,
- vp9_highbd_sad8x4x4d_bits8)
+ vpx_highbd_sad8x4x8_bits8,
+ vpx_highbd_sad8x4x4d_bits8)
HIGHBD_BFP(BLOCK_4X8,
- vp9_highbd_sad4x8_bits8,
- vp9_highbd_sad4x8_avg_bits8,
+ vpx_highbd_sad4x8_bits8,
+ vpx_highbd_sad4x8_avg_bits8,
vp9_highbd_variance4x8,
vp9_highbd_sub_pixel_variance4x8,
vp9_highbd_sub_pixel_avg_variance4x8,
NULL,
- vp9_highbd_sad4x8x8_bits8,
- vp9_highbd_sad4x8x4d_bits8)
+ vpx_highbd_sad4x8x8_bits8,
+ vpx_highbd_sad4x8x4d_bits8)
HIGHBD_BFP(BLOCK_4X4,
- vp9_highbd_sad4x4_bits8,
- vp9_highbd_sad4x4_avg_bits8,
+ vpx_highbd_sad4x4_bits8,
+ vpx_highbd_sad4x4_avg_bits8,
vp9_highbd_variance4x4,
vp9_highbd_sub_pixel_variance4x4,
vp9_highbd_sub_pixel_avg_variance4x4,
- vp9_highbd_sad4x4x3_bits8,
- vp9_highbd_sad4x4x8_bits8,
- vp9_highbd_sad4x4x4d_bits8)
+ vpx_highbd_sad4x4x3_bits8,
+ vpx_highbd_sad4x4x8_bits8,
+ vpx_highbd_sad4x4x4d_bits8)
break;
case VPX_BITS_10:
HIGHBD_BFP(BLOCK_32X16,
- vp9_highbd_sad32x16_bits10,
- vp9_highbd_sad32x16_avg_bits10,
+ vpx_highbd_sad32x16_bits10,
+ vpx_highbd_sad32x16_avg_bits10,
vp9_highbd_10_variance32x16,
vp9_highbd_10_sub_pixel_variance32x16,
vp9_highbd_10_sub_pixel_avg_variance32x16,
NULL,
NULL,
- vp9_highbd_sad32x16x4d_bits10)
+ vpx_highbd_sad32x16x4d_bits10)
HIGHBD_BFP(BLOCK_16X32,
- vp9_highbd_sad16x32_bits10,
- vp9_highbd_sad16x32_avg_bits10,
+ vpx_highbd_sad16x32_bits10,
+ vpx_highbd_sad16x32_avg_bits10,
vp9_highbd_10_variance16x32,
vp9_highbd_10_sub_pixel_variance16x32,
vp9_highbd_10_sub_pixel_avg_variance16x32,
NULL,
NULL,
- vp9_highbd_sad16x32x4d_bits10)
+ vpx_highbd_sad16x32x4d_bits10)
HIGHBD_BFP(BLOCK_64X32,
- vp9_highbd_sad64x32_bits10,
- vp9_highbd_sad64x32_avg_bits10,
+ vpx_highbd_sad64x32_bits10,
+ vpx_highbd_sad64x32_avg_bits10,
vp9_highbd_10_variance64x32,
vp9_highbd_10_sub_pixel_variance64x32,
vp9_highbd_10_sub_pixel_avg_variance64x32,
NULL,
NULL,
- vp9_highbd_sad64x32x4d_bits10)
+ vpx_highbd_sad64x32x4d_bits10)
HIGHBD_BFP(BLOCK_32X64,
- vp9_highbd_sad32x64_bits10,
- vp9_highbd_sad32x64_avg_bits10,
+ vpx_highbd_sad32x64_bits10,
+ vpx_highbd_sad32x64_avg_bits10,
vp9_highbd_10_variance32x64,
vp9_highbd_10_sub_pixel_variance32x64,
vp9_highbd_10_sub_pixel_avg_variance32x64,
NULL,
NULL,
- vp9_highbd_sad32x64x4d_bits10)
+ vpx_highbd_sad32x64x4d_bits10)
HIGHBD_BFP(BLOCK_32X32,
- vp9_highbd_sad32x32_bits10,
- vp9_highbd_sad32x32_avg_bits10,
+ vpx_highbd_sad32x32_bits10,
+ vpx_highbd_sad32x32_avg_bits10,
vp9_highbd_10_variance32x32,
vp9_highbd_10_sub_pixel_variance32x32,
vp9_highbd_10_sub_pixel_avg_variance32x32,
- vp9_highbd_sad32x32x3_bits10,
- vp9_highbd_sad32x32x8_bits10,
- vp9_highbd_sad32x32x4d_bits10)
+ vpx_highbd_sad32x32x3_bits10,
+ vpx_highbd_sad32x32x8_bits10,
+ vpx_highbd_sad32x32x4d_bits10)
HIGHBD_BFP(BLOCK_64X64,
- vp9_highbd_sad64x64_bits10,
- vp9_highbd_sad64x64_avg_bits10,
+ vpx_highbd_sad64x64_bits10,
+ vpx_highbd_sad64x64_avg_bits10,
vp9_highbd_10_variance64x64,
vp9_highbd_10_sub_pixel_variance64x64,
vp9_highbd_10_sub_pixel_avg_variance64x64,
- vp9_highbd_sad64x64x3_bits10,
- vp9_highbd_sad64x64x8_bits10,
- vp9_highbd_sad64x64x4d_bits10)
+ vpx_highbd_sad64x64x3_bits10,
+ vpx_highbd_sad64x64x8_bits10,
+ vpx_highbd_sad64x64x4d_bits10)
HIGHBD_BFP(BLOCK_16X16,
- vp9_highbd_sad16x16_bits10,
- vp9_highbd_sad16x16_avg_bits10,
+ vpx_highbd_sad16x16_bits10,
+ vpx_highbd_sad16x16_avg_bits10,
vp9_highbd_10_variance16x16,
vp9_highbd_10_sub_pixel_variance16x16,
vp9_highbd_10_sub_pixel_avg_variance16x16,
- vp9_highbd_sad16x16x3_bits10,
- vp9_highbd_sad16x16x8_bits10,
- vp9_highbd_sad16x16x4d_bits10)
+ vpx_highbd_sad16x16x3_bits10,
+ vpx_highbd_sad16x16x8_bits10,
+ vpx_highbd_sad16x16x4d_bits10)
HIGHBD_BFP(BLOCK_16X8,
- vp9_highbd_sad16x8_bits10,
- vp9_highbd_sad16x8_avg_bits10,
+ vpx_highbd_sad16x8_bits10,
+ vpx_highbd_sad16x8_avg_bits10,
vp9_highbd_10_variance16x8,
vp9_highbd_10_sub_pixel_variance16x8,
vp9_highbd_10_sub_pixel_avg_variance16x8,
- vp9_highbd_sad16x8x3_bits10,
- vp9_highbd_sad16x8x8_bits10,
- vp9_highbd_sad16x8x4d_bits10)
+ vpx_highbd_sad16x8x3_bits10,
+ vpx_highbd_sad16x8x8_bits10,
+ vpx_highbd_sad16x8x4d_bits10)
HIGHBD_BFP(BLOCK_8X16,
- vp9_highbd_sad8x16_bits10,
- vp9_highbd_sad8x16_avg_bits10,
+ vpx_highbd_sad8x16_bits10,
+ vpx_highbd_sad8x16_avg_bits10,
vp9_highbd_10_variance8x16,
vp9_highbd_10_sub_pixel_variance8x16,
vp9_highbd_10_sub_pixel_avg_variance8x16,
- vp9_highbd_sad8x16x3_bits10,
- vp9_highbd_sad8x16x8_bits10,
- vp9_highbd_sad8x16x4d_bits10)
+ vpx_highbd_sad8x16x3_bits10,
+ vpx_highbd_sad8x16x8_bits10,
+ vpx_highbd_sad8x16x4d_bits10)
HIGHBD_BFP(BLOCK_8X8,
- vp9_highbd_sad8x8_bits10,
- vp9_highbd_sad8x8_avg_bits10,
+ vpx_highbd_sad8x8_bits10,
+ vpx_highbd_sad8x8_avg_bits10,
vp9_highbd_10_variance8x8,
vp9_highbd_10_sub_pixel_variance8x8,
vp9_highbd_10_sub_pixel_avg_variance8x8,
- vp9_highbd_sad8x8x3_bits10,
- vp9_highbd_sad8x8x8_bits10,
- vp9_highbd_sad8x8x4d_bits10)
+ vpx_highbd_sad8x8x3_bits10,
+ vpx_highbd_sad8x8x8_bits10,
+ vpx_highbd_sad8x8x4d_bits10)
HIGHBD_BFP(BLOCK_8X4,
- vp9_highbd_sad8x4_bits10,
- vp9_highbd_sad8x4_avg_bits10,
+ vpx_highbd_sad8x4_bits10,
+ vpx_highbd_sad8x4_avg_bits10,
vp9_highbd_10_variance8x4,
vp9_highbd_10_sub_pixel_variance8x4,
vp9_highbd_10_sub_pixel_avg_variance8x4,
NULL,
- vp9_highbd_sad8x4x8_bits10,
- vp9_highbd_sad8x4x4d_bits10)
+ vpx_highbd_sad8x4x8_bits10,
+ vpx_highbd_sad8x4x4d_bits10)
HIGHBD_BFP(BLOCK_4X8,
- vp9_highbd_sad4x8_bits10,
- vp9_highbd_sad4x8_avg_bits10,
+ vpx_highbd_sad4x8_bits10,
+ vpx_highbd_sad4x8_avg_bits10,
vp9_highbd_10_variance4x8,
vp9_highbd_10_sub_pixel_variance4x8,
vp9_highbd_10_sub_pixel_avg_variance4x8,
NULL,
- vp9_highbd_sad4x8x8_bits10,
- vp9_highbd_sad4x8x4d_bits10)
+ vpx_highbd_sad4x8x8_bits10,
+ vpx_highbd_sad4x8x4d_bits10)
HIGHBD_BFP(BLOCK_4X4,
- vp9_highbd_sad4x4_bits10,
- vp9_highbd_sad4x4_avg_bits10,
+ vpx_highbd_sad4x4_bits10,
+ vpx_highbd_sad4x4_avg_bits10,
vp9_highbd_10_variance4x4,
vp9_highbd_10_sub_pixel_variance4x4,
vp9_highbd_10_sub_pixel_avg_variance4x4,
- vp9_highbd_sad4x4x3_bits10,
- vp9_highbd_sad4x4x8_bits10,
- vp9_highbd_sad4x4x4d_bits10)
+ vpx_highbd_sad4x4x3_bits10,
+ vpx_highbd_sad4x4x8_bits10,
+ vpx_highbd_sad4x4x4d_bits10)
break;
case VPX_BITS_12:
HIGHBD_BFP(BLOCK_32X16,
- vp9_highbd_sad32x16_bits12,
- vp9_highbd_sad32x16_avg_bits12,
+ vpx_highbd_sad32x16_bits12,
+ vpx_highbd_sad32x16_avg_bits12,
vp9_highbd_12_variance32x16,
vp9_highbd_12_sub_pixel_variance32x16,
vp9_highbd_12_sub_pixel_avg_variance32x16,
NULL,
NULL,
- vp9_highbd_sad32x16x4d_bits12)
+ vpx_highbd_sad32x16x4d_bits12)
HIGHBD_BFP(BLOCK_16X32,
- vp9_highbd_sad16x32_bits12,
- vp9_highbd_sad16x32_avg_bits12,
+ vpx_highbd_sad16x32_bits12,
+ vpx_highbd_sad16x32_avg_bits12,
vp9_highbd_12_variance16x32,
vp9_highbd_12_sub_pixel_variance16x32,
vp9_highbd_12_sub_pixel_avg_variance16x32,
NULL,
NULL,
- vp9_highbd_sad16x32x4d_bits12)
+ vpx_highbd_sad16x32x4d_bits12)
HIGHBD_BFP(BLOCK_64X32,
- vp9_highbd_sad64x32_bits12,
- vp9_highbd_sad64x32_avg_bits12,
+ vpx_highbd_sad64x32_bits12,
+ vpx_highbd_sad64x32_avg_bits12,
vp9_highbd_12_variance64x32,
vp9_highbd_12_sub_pixel_variance64x32,
vp9_highbd_12_sub_pixel_avg_variance64x32,
NULL,
NULL,
- vp9_highbd_sad64x32x4d_bits12)
+ vpx_highbd_sad64x32x4d_bits12)
HIGHBD_BFP(BLOCK_32X64,
- vp9_highbd_sad32x64_bits12,
- vp9_highbd_sad32x64_avg_bits12,
+ vpx_highbd_sad32x64_bits12,
+ vpx_highbd_sad32x64_avg_bits12,
vp9_highbd_12_variance32x64,
vp9_highbd_12_sub_pixel_variance32x64,
vp9_highbd_12_sub_pixel_avg_variance32x64,
NULL,
NULL,
- vp9_highbd_sad32x64x4d_bits12)
+ vpx_highbd_sad32x64x4d_bits12)
HIGHBD_BFP(BLOCK_32X32,
- vp9_highbd_sad32x32_bits12,
- vp9_highbd_sad32x32_avg_bits12,
+ vpx_highbd_sad32x32_bits12,
+ vpx_highbd_sad32x32_avg_bits12,
vp9_highbd_12_variance32x32,
vp9_highbd_12_sub_pixel_variance32x32,
vp9_highbd_12_sub_pixel_avg_variance32x32,
- vp9_highbd_sad32x32x3_bits12,
- vp9_highbd_sad32x32x8_bits12,
- vp9_highbd_sad32x32x4d_bits12)
+ vpx_highbd_sad32x32x3_bits12,
+ vpx_highbd_sad32x32x8_bits12,
+ vpx_highbd_sad32x32x4d_bits12)
HIGHBD_BFP(BLOCK_64X64,
- vp9_highbd_sad64x64_bits12,
- vp9_highbd_sad64x64_avg_bits12,
+ vpx_highbd_sad64x64_bits12,
+ vpx_highbd_sad64x64_avg_bits12,
vp9_highbd_12_variance64x64,
vp9_highbd_12_sub_pixel_variance64x64,
vp9_highbd_12_sub_pixel_avg_variance64x64,
- vp9_highbd_sad64x64x3_bits12,
- vp9_highbd_sad64x64x8_bits12,
- vp9_highbd_sad64x64x4d_bits12)
+ vpx_highbd_sad64x64x3_bits12,
+ vpx_highbd_sad64x64x8_bits12,
+ vpx_highbd_sad64x64x4d_bits12)
HIGHBD_BFP(BLOCK_16X16,
- vp9_highbd_sad16x16_bits12,
- vp9_highbd_sad16x16_avg_bits12,
+ vpx_highbd_sad16x16_bits12,
+ vpx_highbd_sad16x16_avg_bits12,
vp9_highbd_12_variance16x16,
vp9_highbd_12_sub_pixel_variance16x16,
vp9_highbd_12_sub_pixel_avg_variance16x16,
- vp9_highbd_sad16x16x3_bits12,
- vp9_highbd_sad16x16x8_bits12,
- vp9_highbd_sad16x16x4d_bits12)
+ vpx_highbd_sad16x16x3_bits12,
+ vpx_highbd_sad16x16x8_bits12,
+ vpx_highbd_sad16x16x4d_bits12)
HIGHBD_BFP(BLOCK_16X8,
- vp9_highbd_sad16x8_bits12,
- vp9_highbd_sad16x8_avg_bits12,
+ vpx_highbd_sad16x8_bits12,
+ vpx_highbd_sad16x8_avg_bits12,
vp9_highbd_12_variance16x8,
vp9_highbd_12_sub_pixel_variance16x8,
vp9_highbd_12_sub_pixel_avg_variance16x8,
- vp9_highbd_sad16x8x3_bits12,
- vp9_highbd_sad16x8x8_bits12,
- vp9_highbd_sad16x8x4d_bits12)
+ vpx_highbd_sad16x8x3_bits12,
+ vpx_highbd_sad16x8x8_bits12,
+ vpx_highbd_sad16x8x4d_bits12)
HIGHBD_BFP(BLOCK_8X16,
- vp9_highbd_sad8x16_bits12,
- vp9_highbd_sad8x16_avg_bits12,
+ vpx_highbd_sad8x16_bits12,
+ vpx_highbd_sad8x16_avg_bits12,
vp9_highbd_12_variance8x16,
vp9_highbd_12_sub_pixel_variance8x16,
vp9_highbd_12_sub_pixel_avg_variance8x16,
- vp9_highbd_sad8x16x3_bits12,
- vp9_highbd_sad8x16x8_bits12,
- vp9_highbd_sad8x16x4d_bits12)
+ vpx_highbd_sad8x16x3_bits12,
+ vpx_highbd_sad8x16x8_bits12,
+ vpx_highbd_sad8x16x4d_bits12)
HIGHBD_BFP(BLOCK_8X8,
- vp9_highbd_sad8x8_bits12,
- vp9_highbd_sad8x8_avg_bits12,
+ vpx_highbd_sad8x8_bits12,
+ vpx_highbd_sad8x8_avg_bits12,
vp9_highbd_12_variance8x8,
vp9_highbd_12_sub_pixel_variance8x8,
vp9_highbd_12_sub_pixel_avg_variance8x8,
- vp9_highbd_sad8x8x3_bits12,
- vp9_highbd_sad8x8x8_bits12,
- vp9_highbd_sad8x8x4d_bits12)
+ vpx_highbd_sad8x8x3_bits12,
+ vpx_highbd_sad8x8x8_bits12,
+ vpx_highbd_sad8x8x4d_bits12)
HIGHBD_BFP(BLOCK_8X4,
- vp9_highbd_sad8x4_bits12,
- vp9_highbd_sad8x4_avg_bits12,
+ vpx_highbd_sad8x4_bits12,
+ vpx_highbd_sad8x4_avg_bits12,
vp9_highbd_12_variance8x4,
vp9_highbd_12_sub_pixel_variance8x4,
vp9_highbd_12_sub_pixel_avg_variance8x4,
NULL,
- vp9_highbd_sad8x4x8_bits12,
- vp9_highbd_sad8x4x4d_bits12)
+ vpx_highbd_sad8x4x8_bits12,
+ vpx_highbd_sad8x4x4d_bits12)
HIGHBD_BFP(BLOCK_4X8,
- vp9_highbd_sad4x8_bits12,
- vp9_highbd_sad4x8_avg_bits12,
+ vpx_highbd_sad4x8_bits12,
+ vpx_highbd_sad4x8_avg_bits12,
vp9_highbd_12_variance4x8,
vp9_highbd_12_sub_pixel_variance4x8,
vp9_highbd_12_sub_pixel_avg_variance4x8,
NULL,
- vp9_highbd_sad4x8x8_bits12,
- vp9_highbd_sad4x8x4d_bits12)
+ vpx_highbd_sad4x8x8_bits12,
+ vpx_highbd_sad4x8x4d_bits12)
HIGHBD_BFP(BLOCK_4X4,
- vp9_highbd_sad4x4_bits12,
- vp9_highbd_sad4x4_avg_bits12,
+ vpx_highbd_sad4x4_bits12,
+ vpx_highbd_sad4x4_avg_bits12,
vp9_highbd_12_variance4x4,
vp9_highbd_12_sub_pixel_variance4x4,
vp9_highbd_12_sub_pixel_avg_variance4x4,
- vp9_highbd_sad4x4x3_bits12,
- vp9_highbd_sad4x4x8_bits12,
- vp9_highbd_sad4x4x4d_bits12)
+ vpx_highbd_sad4x4x3_bits12,
+ vpx_highbd_sad4x4x8_bits12,
+ vpx_highbd_sad4x4x4d_bits12)
break;
default:
@@ -1210,6 +1395,32 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
}
#endif // CONFIG_VP9_HIGHBITDEPTH
+static void realloc_segmentation_maps(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ // Create the encoder segmentation map and set all entries to 0
+ vpx_free(cpi->segmentation_map);
+ CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+ // Create a map used for cyclic background refresh.
+ if (cpi->cyclic_refresh)
+ vp9_cyclic_refresh_free(cpi->cyclic_refresh);
+ CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+ vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+
+ // Create a map used to mark inactive areas.
+ vpx_free(cpi->active_map.map);
+ CHECK_MEM_ERROR(cm, cpi->active_map.map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+ // And a placeholder structure for the coding context, for use if we
+ // want to save and restore it
+ vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+ CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+}
+
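realloc_segmentation_maps leans on CHECK_MEM_ERROR, which assigns and then aborts the encode via vpx_internal_error (a longjmp back to the setjmp armed in the caller) when the allocation fails; that is why the function needs no explicit unwinding. The macro's approximate shape, paraphrased rather than quoted (see the vp9 common headers for the exact definition):

    #define CHECK_MEM_ERROR(cm, lval, expr) do {                   \
        (lval) = (expr);                                           \
        if (!(lval))                                               \
          vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR,    \
                             "Failed to allocate " #lval);         \
      } while (0)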
void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
@@ -1217,6 +1428,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
if (cm->profile != oxcf->profile)
cm->profile = oxcf->profile;
cm->bit_depth = oxcf->bit_depth;
+ cm->color_space = oxcf->color_space;
if (cm->profile <= PROFILE_1)
assert(cm->bit_depth == VPX_BITS_8);
@@ -1225,7 +1437,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cpi->oxcf = *oxcf;
#if CONFIG_VP9_HIGHBITDEPTH
- cpi->mb.e_mbd.bd = (int)cm->bit_depth;
+ cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
#endif // CONFIG_VP9_HIGHBITDEPTH
rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -1264,13 +1476,16 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cm->display_width = cpi->oxcf.width;
cm->display_height = cpi->oxcf.height;
+ cm->width = cpi->oxcf.width;
+ cm->height = cpi->oxcf.height;
if (cpi->initial_width) {
- // Increasing the size of the frame beyond the first seen frame, or some
- // otherwise signaled maximum size, is not supported.
- // TODO(jkoleszar): exit gracefully.
- assert(cm->width <= cpi->initial_width);
- assert(cm->height <= cpi->initial_height);
+ if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) {
+ vp9_free_context_buffers(cm);
+ vp9_alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ }
}
update_frame_size(cpi);
@@ -1278,7 +1493,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cpi->oxcf.rc_mode == VPX_CBR) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
- cpi->oxcf.pass == 2)) {
+ cpi->oxcf.pass != 1)) {
vp9_update_layer_context_change_config(cpi,
(int)cpi->oxcf.target_bandwidth);
}
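The removed asserts mean that growing the frame beyond the first-seen size is no longer fatal: vp9_change_config now frees the context buffers, reallocates the compressor data and segmentation maps, and clears initial_width/height so the next frame re-establishes the baseline. In API terms this is what lets a caller enlarge the stream mid-encode, roughly as follows (codec and initial_cfg are assumed from earlier encoder setup):

    vpx_codec_enc_cfg_t cfg = initial_cfg;  /* the cfg the encoder was opened with */
    cfg.g_w = 1920;                         /* larger than the initial frames */
    cfg.g_h = 1080;
    /* routes into vp9_change_config(), which now reallocates as needed */
    vpx_codec_enc_config_set(&codec, &cfg);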
@@ -1300,17 +1515,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
#endif
-
-#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif
- VP9_ENC_BORDER_IN_PIXELS);
- }
-#endif
}
#ifndef M_LOG2_E
@@ -1356,10 +1560,11 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
}
-VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
- unsigned int i, j;
- VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP));
- VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL;
+VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+ BufferPool *const pool) {
+ unsigned int i;
+ VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP));
+ VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
if (!cm)
return NULL;
@@ -1373,31 +1578,27 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
}
cm->error.setjmp = 1;
+ cm->alloc_mi = vp9_enc_alloc_mi;
+ cm->free_mi = vp9_enc_free_mi;
+ cm->setup_mi = vp9_enc_setup_mi;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(cm, cm->frame_contexts,
+ (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+ sizeof(*cm->frame_contexts)));
cpi->use_svc = 0;
+ cpi->common.buffer_pool = pool;
init_config(cpi, oxcf);
vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
cm->current_video_frame = 0;
cpi->partition_search_skippable_frame = 0;
+ cpi->tile_data = NULL;
- // Create the encoder segmentation map and set all entries to 0
- CHECK_MEM_ERROR(cm, cpi->segmentation_map,
- vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
-
- // Create a complexity map used for rd adjustment
- CHECK_MEM_ERROR(cm, cpi->complexity_map,
- vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
-
- // Create a map used for cyclic background refresh.
- CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
- vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
-
- // And a place holder structure is the coding context
- // for use if we want to save and restore it
- CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
- vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+ realloc_segmentation_maps(cpi);
CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
@@ -1435,45 +1636,24 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
#endif
cpi->refresh_alt_ref_frame = 0;
-
- // Note that at the moment multi_arf will not work with svc.
- // For the current check in all the execution paths are defaulted to 0
- // pending further tuning and testing. The code is left in place here
- // as a place holder in regard to the required paths.
cpi->multi_arf_last_grp_enabled = 0;
- if (oxcf->pass == 2) {
- if (cpi->use_svc) {
- cpi->multi_arf_allowed = 0;
- cpi->multi_arf_enabled = 0;
- } else {
- // Disable by default for now.
- cpi->multi_arf_allowed = 0;
- cpi->multi_arf_enabled = 0;
- }
- } else {
- cpi->multi_arf_allowed = 0;
- cpi->multi_arf_enabled = 0;
- }
cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
#if CONFIG_INTERNAL_STATS
cpi->b_calculate_ssimg = 0;
+ cpi->b_calculate_blockiness = 1;
+ cpi->b_calculate_consistency = 1;
+ cpi->total_inconsistency = 0;
+ cpi->psnr.worst = 100.0;
+ cpi->worst_ssim = 100.0;
cpi->count = 0;
cpi->bytes = 0;
if (cpi->b_calculate_psnr) {
- cpi->total_y = 0.0;
- cpi->total_u = 0.0;
- cpi->total_v = 0.0;
- cpi->total = 0.0;
cpi->total_sq_error = 0;
cpi->total_samples = 0;
- cpi->totalp_y = 0.0;
- cpi->totalp_u = 0.0;
- cpi->totalp_v = 0.0;
- cpi->totalp = 0.0;
cpi->totalp_sq_error = 0;
cpi->totalp_samples = 0;
@@ -1485,34 +1665,47 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
}
if (cpi->b_calculate_ssimg) {
- cpi->total_ssimg_y = 0;
- cpi->total_ssimg_u = 0;
- cpi->total_ssimg_v = 0;
- cpi->total_ssimg_all = 0;
+ cpi->ssimg.worst = 100.0;
+ }
+ cpi->fastssim.worst = 100.0;
+
+ cpi->psnrhvs.worst = 100.0;
+
+ if (cpi->b_calculate_blockiness) {
+ cpi->total_blockiness = 0;
+ cpi->worst_blockiness = 0.0;
+ }
+
+ if (cpi->b_calculate_consistency) {
+ cpi->ssim_vars = vpx_malloc(sizeof(*cpi->ssim_vars) * 720 * 480);
+ cpi->worst_consistency = 100.0;
}
#endif
cpi->first_time_stamp_ever = INT64_MAX;
- cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
- cpi->mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
- cpi->mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
- cpi->mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
- cpi->mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
- cal_nmvsadcosts(cpi->mb.nmvsadcost);
+ cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
+ cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+ cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+ cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+ cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
+ cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
- cpi->mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
- cpi->mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
- cpi->mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
- cpi->mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
- cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
+ cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+ cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+ cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+ cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
+ cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp);
#if CONFIG_VP9_TEMPORAL_DENOISING
#ifdef OUTPUT_YUV_DENOISED
yuv_denoised_file = fopen("denoised.yuv", "ab");
#endif
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
#ifdef OUTPUT_YUV_REC
yuv_rec_file = fopen("rec.yuv", "wb");
#endif
@@ -1589,7 +1782,8 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
}
}
- vp9_set_speed_features(cpi);
+ vp9_set_speed_features_framesize_independent(cpi);
+ vp9_set_speed_features_framesize_dependent(cpi);
// Allocate memory to store variances for a frame.
CHECK_MEM_ERROR(cm, cpi->source_diff_var,
@@ -1597,14 +1791,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
cpi->source_var_thresh = 0;
cpi->frames_till_next_var_check = 0;
- // Default rd threshold factors for mode selection
- for (i = 0; i < BLOCK_SIZES; ++i) {
- for (j = 0; j < MAX_MODES; ++j) {
- cpi->rd.thresh_freq_fact[i][j] = 32;
- cpi->rd.mode_map[i][j] = j;
- }
- }
-
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF)\
cpi->fn_ptr[BT].sdf = SDF; \
cpi->fn_ptr[BT].sdaf = SDAF; \
@@ -1615,64 +1801,64 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
cpi->fn_ptr[BT].sdx8f = SDX8F; \
cpi->fn_ptr[BT].sdx4df = SDX4DF;
- BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
+ BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
vp9_variance32x16, vp9_sub_pixel_variance32x16,
- vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d)
+ vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
- BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
+ BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
vp9_variance16x32, vp9_sub_pixel_variance16x32,
- vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d)
+ vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
- BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
+ BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
vp9_variance64x32, vp9_sub_pixel_variance64x32,
- vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d)
+ vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
- BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
+ BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
vp9_variance32x64, vp9_sub_pixel_variance32x64,
- vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d)
+ vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
- BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
+ BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
vp9_variance32x32, vp9_sub_pixel_variance32x32,
- vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8,
- vp9_sad32x32x4d)
+ vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
+ vpx_sad32x32x4d)
- BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
+ BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
vp9_variance64x64, vp9_sub_pixel_variance64x64,
- vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8,
- vp9_sad64x64x4d)
+ vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
+ vpx_sad64x64x4d)
- BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
+ BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
vp9_variance16x16, vp9_sub_pixel_variance16x16,
- vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8,
- vp9_sad16x16x4d)
+ vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
+ vpx_sad16x16x4d)
- BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
+ BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
vp9_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8,
- vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+ vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
- BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
+ BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
vp9_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16,
- vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+ vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
- BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
+ BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
vp9_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8,
- vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+ vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
- BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
+ BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
vp9_variance8x4, vp9_sub_pixel_variance8x4,
- vp9_sub_pixel_avg_variance8x4, NULL, vp9_sad8x4x8, vp9_sad8x4x4d)
+ vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
- BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
+ BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
vp9_variance4x8, vp9_sub_pixel_variance4x8,
- vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d)
+ vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
- BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
+ BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
vp9_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4,
- vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+ vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
@@ -1691,20 +1877,27 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
return cpi;
}
+#define SNPRINT(H, T) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
void vp9_remove_compressor(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
unsigned int i;
+ int t;
if (!cpi)
return;
- if (cpi && (cpi->common.current_video_frame > 0)) {
+ if (cpi && (cm->current_video_frame > 0)) {
#if CONFIG_INTERNAL_STATS
-
vp9_clear_system_state();
- // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
if (cpi->oxcf.pass != 1) {
+ char headings[512] = {0};
+ char results[512] = {0};
FILE *f = fopen("opsnr.stt", "a");
double time_encoded = (cpi->last_end_time_stamp_seen
- cpi->first_time_stamp_ever) / 10000000.000;
@@ -1722,25 +1915,50 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
(double)cpi->totalp_sq_error);
const double total_ssim = 100 * pow(cpi->summed_quality /
- cpi->summed_weights, 8.0);
+ cpi->summed_weights, 8.0);
const double totalp_ssim = 100 * pow(cpi->summedp_quality /
- cpi->summedp_weights, 8.0);
-
- fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
- "VPXSSIM\tVPSSIMP\t Time(ms)\n");
- fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
- dr, cpi->total / cpi->count, total_psnr,
- cpi->totalp / cpi->count, totalp_psnr, total_ssim, totalp_ssim,
- total_encode_time);
- }
+ cpi->summedp_weights, 8.0);
+
+ snprintf(headings, sizeof(headings),
+ "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+ "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+ "WstPsnr\tWstSsim\tWstFast\tWstHVS");
+ snprintf(results, sizeof(results),
+ "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f",
+ dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
+ cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr,
+ total_ssim, totalp_ssim,
+ cpi->fastssim.stat[ALL] / cpi->count,
+ cpi->psnrhvs.stat[ALL] / cpi->count,
+ cpi->psnr.worst, cpi->worst_ssim, cpi->fastssim.worst,
+ cpi->psnrhvs.worst);
+
+ if (cpi->b_calculate_blockiness) {
+ SNPRINT(headings, "\t Block\tWstBlck");
+ SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+ }
- if (cpi->b_calculate_ssimg) {
- fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(ms)\n");
- fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
- cpi->total_ssimg_y / cpi->count,
- cpi->total_ssimg_u / cpi->count,
- cpi->total_ssimg_v / cpi->count,
- cpi->total_ssimg_all / cpi->count, total_encode_time);
+ if (cpi->b_calculate_consistency) {
+ double consistency =
+ vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
+ (double)cpi->total_inconsistency);
+
+ SNPRINT(headings, "\tConsist\tWstCons");
+ SNPRINT2(results, "\t%7.3f", consistency);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+ }
+
+ if (cpi->b_calculate_ssimg) {
+ SNPRINT(headings, "\t SSIMG\tWtSSIMG");
+ SNPRINT2(results, "\t%7.3f", cpi->ssimg.stat[ALL] / cpi->count);
+ SNPRINT2(results, "\t%7.3f", cpi->ssimg.worst);
+ }
+
+ fprintf(f, "%s\t Time\n", headings);
+ fprintf(f, "%s\t%8.0f\n", results, total_encode_time);
}
fclose(f);
@@ -1761,13 +1979,30 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
}
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_free(&(cpi->denoiser));
- }
+ vp9_denoiser_free(&(cpi->denoiser));
#endif
+ for (t = 0; t < cpi->num_workers; ++t) {
+ VP9Worker *const worker = &cpi->workers[t];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+    // End the worker thread and wait for it to finish.
+ vp9_get_worker_interface()->end(worker);
+
+    // Free per-thread data; the last worker shares the main thread's.
+ if (t < cpi->num_workers - 1) {
+ vpx_free(thread_data->td->counts);
+ vp9_free_pc_tree(thread_data->td);
+ vpx_free(thread_data->td);
+ }
+ }
+ vpx_free(cpi->tile_thr_data);
+ vpx_free(cpi->workers);
+
+ if (cpi->num_workers > 1)
+ vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+
dealloc_compressor_data(cpi);
- vpx_free(cpi->tok);
for (i = 0; i < sizeof(cpi->mbgraph_stats) /
sizeof(cpi->mbgraph_stats[0]); ++i) {
@@ -1781,7 +2016,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
}
#endif
- vp9_remove_common(&cpi->common);
+ vp9_remove_common(cm);
+ vp9_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+ vp9_free_postproc_buffers(cm);
+#endif
vpx_free(cpi);
#if CONFIG_VP9_TEMPORAL_DENOISING
@@ -1789,6 +2028,9 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
fclose(yuv_denoised_file);
#endif
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ fclose(yuv_skinmap_file);
+#endif
#ifdef OUTPUT_YUV_REC
fclose(yuv_rec_file);
#endif
@@ -1914,11 +2156,13 @@ typedef struct {
static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
PSNR_STATS *psnr) {
static const double peak = 255.0;
- const int widths[3] = {a->y_width, a->uv_width, a->uv_width };
- const int heights[3] = {a->y_height, a->uv_height, a->uv_height};
- const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer };
+ const int widths[3] = {
+ a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
+ const int heights[3] = {
+ a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
+ const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
- const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer };
+ const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
int i;
uint64_t total_sse = 0;
@@ -1951,8 +2195,10 @@ static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
PSNR_STATS *psnr,
unsigned int bit_depth,
unsigned int in_bit_depth) {
- const int widths[3] = {a->y_width, a->uv_width, a->uv_width };
- const int heights[3] = {a->y_height, a->uv_height, a->uv_height};
+ const int widths[3] =
+      {a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
+ const int heights[3] =
+ {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer };
const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer };
@@ -2003,7 +2249,7 @@ static void generate_psnr_packet(VP9_COMP *cpi) {
PSNR_STATS psnr;
#if CONFIG_VP9_HIGHBITDEPTH
calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
- cpi->mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+ cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
#else
calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
#endif
@@ -2076,8 +2322,7 @@ int vp9_update_entropy(VP9_COMP * cpi, int update) {
return 0;
}
-#if CONFIG_VP9_TEMPORAL_DENOISING
-#if defined(OUTPUT_YUV_DENOISED)
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
// not denoise the UV channels at this time. If ever we implement UV channel
@@ -2092,23 +2337,22 @@ void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
} while (--h);
src = s->u_buffer;
- h = s->uv_height / 2;
+ h = s->uv_height;
do {
- fwrite(src, s->uv_width / 2, 1, f);
- src += s->uv_stride + s->uv_width / 2;
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
} while (--h);
src = s->v_buffer;
- h = s->uv_height / 2;
+ h = s->uv_height;
do {
- fwrite(src, s->uv_width / 2, 1, f);
- src += s->uv_stride + s->uv_width / 2;
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
} while (--h);
}
#endif
-#endif
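
vp9_write_yuv_frame_420() above dumps each plane one row at a time because rows sit stride bytes apart in memory while only the leftmost width bytes are payload. A generic sketch of that inner loop:

    #include <stdio.h>

    /* Dump one plane of width w and height h whose rows are `stride`
     * (>= w) bytes apart; only the first w bytes of each row are image
     * data, the rest is padding. */
    static void write_plane(const unsigned char *src, int w, int h,
                            int stride, FILE *f) {
      int y;
      for (y = 0; y < h; ++y) {
        fwrite(src, 1, (size_t)w, f);
        src += stride;
      }
    }
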
#ifdef OUTPUT_YUV_REC
void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
@@ -2267,28 +2511,43 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
vp9_extend_frame_borders(dst);
}
+static int scale_down(VP9_COMP *cpi, int q) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int scale = 0;
+ assert(frame_is_kf_gf_arf(cpi));
+
+ if (rc->frame_size_selector == UNSCALED &&
+ q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) {
+ const int max_size_thresh = (int)(rate_thresh_mult[SCALE_STEP1]
+ * MAX(rc->this_frame_target, rc->avg_frame_bandwidth));
+ scale = rc->projected_frame_size > max_size_thresh ? 1 : 0;
+ }
+ return scale;
+}
+
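
scale_down() arms dynamic downscaling only for key/golden/alt-ref frames at the unscaled size, when q has already hit the rate-level maximum and the projected size still exceeds a multiple of the larger of the frame target and the average bandwidth. Worked numbers for that last test (the multiplier is assumed; the real rate_thresh_mult table may differ):

    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void) {
      const double rate_thresh_mult_step1 = 1.5;  /* assumed multiplier */
      const int this_frame_target = 80000;        /* bits, illustrative */
      const int avg_frame_bandwidth = 60000;      /* bits, illustrative */
      const int max_size_thresh = (int)(rate_thresh_mult_step1 *
          MAX(this_frame_target, avg_frame_bandwidth));
      const int projected_frame_size = 150000;
      printf("threshold=%d scale=%d\n", max_size_thresh,
             projected_frame_size > max_size_thresh ? 1 : 0); /* 120000 1 */
      return 0;
    }
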
// Function to test for conditions that indicate we should loop
// back and recode a frame.
-static int recode_loop_test(const VP9_COMP *cpi,
+static int recode_loop_test(VP9_COMP *cpi,
int high_limit, int low_limit,
int q, int maxq, int minq) {
- const VP9_COMMON *const cm = &cpi->common;
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
int force_recode = 0;
- // Special case trap if maximum allowed frame size exceeded.
- if (rc->projected_frame_size > rc->max_frame_bandwidth) {
- force_recode = 1;
-
- // Is frame recode allowed.
- // Yes if either recode mode 1 is selected or mode 2 is selected
- // and the frame is a key frame, golden frame or alt_ref_frame
- } else if ((cpi->sf.recode_loop == ALLOW_RECODE) ||
- ((cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF) &&
- (cm->frame_type == KEY_FRAME ||
- cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
- // General over and under shoot tests
+ if ((cpi->sf.recode_loop == ALLOW_RECODE) ||
+ (frame_is_kfgfarf &&
+ (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+ if (frame_is_kfgfarf &&
+ (oxcf->resize_mode == RESIZE_DYNAMIC) &&
+ scale_down(cpi, q)) {
+ // Code this group at a lower resolution.
+ cpi->resize_pending = 1;
+ return 1;
+ }
+
+ // TODO(agrange) high_limit could be greater than the scale-down threshold.
if ((rc->projected_frame_size > high_limit && q < maxq) ||
(rc->projected_frame_size < low_limit && q > minq)) {
force_recode = 1;
@@ -2306,13 +2565,14 @@ static int recode_loop_test(const VP9_COMP *cpi,
void vp9_update_reference_frames(VP9_COMP *cpi) {
VP9_COMMON * const cm = &cpi->common;
+ BufferPool *const pool = cm->buffer_pool;
// At this point the new frame has been encoded.
// If any buffer copy / swapping is signaled it should be done here.
if (cm->frame_type == KEY_FRAME) {
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
} else if (vp9_preserve_existing_gf(cpi)) {
// We have decided to preserve the previously existing golden frame as our
@@ -2325,7 +2585,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
// slot and, if we're updating the GF, the current frame becomes the new GF.
int tmp;
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
tmp = cpi->alt_fb_idx;
@@ -2344,34 +2604,34 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
arf_idx = gf_group->arf_update_idx[gf_group->index];
}
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[arf_idx], cm->new_fb_idx);
- vpx_memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
+ memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
}
if (cpi->refresh_golden_frame) {
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
if (!cpi->rc.is_src_frame_alt_ref)
- vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
+ memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
else
- vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
- cpi->interp_filter_selected[ALTREF_FRAME],
- sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
+ memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[ALTREF_FRAME],
+ sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
}
}
if (cpi->refresh_last_frame) {
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
if (!cpi->rc.is_src_frame_alt_ref)
- vpx_memcpy(cpi->interp_filter_selected[LAST_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
}
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0) {
@@ -2386,7 +2646,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
}
static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
struct loopfilter *lf = &cm->lf;
if (xd->lossless) {
lf->filter_level = 0;
@@ -2404,42 +2664,83 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
}
if (lf->filter_level > 0) {
- vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+ if (cpi->num_workers > 1)
+ vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
+ lf->filter_level, 0, 0,
+ cpi->workers, cpi->num_workers,
+ &cpi->lf_row_sync);
+ else
+ vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
}
vp9_extend_frame_inner_borders(cm->frame_to_show);
}
+static INLINE void alloc_frame_mvs(const VP9_COMMON *cm,
+ int buffer_idx) {
+ RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+ if (new_fb_ptr->mvs == NULL ||
+ new_fb_ptr->mi_rows < cm->mi_rows ||
+ new_fb_ptr->mi_cols < cm->mi_cols) {
+ vpx_free(new_fb_ptr->mvs);
+ new_fb_ptr->mvs =
+ (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*new_fb_ptr->mvs));
+ new_fb_ptr->mi_rows = cm->mi_rows;
+ new_fb_ptr->mi_cols = cm->mi_cols;
+ }
+}
+
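
alloc_frame_mvs() follows a grow-only pattern: the per-frame MV array is re-allocated only when the mi-unit grid is larger than what the buffer already covers, so steady-state encoding at a fixed size does no allocation. A self-contained sketch of the same idea (names are illustrative):

    #include <stdlib.h>

    typedef struct { void *mvs; int mi_rows, mi_cols; } MvBufSketch;

    /* Re-allocate only when the requested grid outgrows the buffer;
     * returns 0 on success, -1 on allocation failure. */
    static int ensure_mv_storage(MvBufSketch *buf, int mi_rows, int mi_cols,
                                 size_t elem_size) {
      if (buf->mvs == NULL || buf->mi_rows < mi_rows ||
          buf->mi_cols < mi_cols) {
        free(buf->mvs);
        buf->mvs = calloc((size_t)mi_rows * mi_cols, elem_size);
        if (buf->mvs == NULL) return -1;
        buf->mi_rows = mi_rows;
        buf->mi_cols = mi_cols;
      }
      return 0;
    }
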
void vp9_scale_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
MV_REFERENCE_FRAME ref_frame;
const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
- const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
-
// Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
- if ((cpi->ref_frame_flags & ref_mask[ref_frame - 1]) &&
- (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)) {
- const int new_fb = get_free_fb(cm);
- vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
- cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif // CONFIG_VP9_HIGHBITDEPTH
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+ if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+ BufferPool *const pool = cm->buffer_pool;
+ const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi,
+ ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ continue;
+ }
+
#if CONFIG_VP9_HIGHBITDEPTH
- scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf,
- (int)cm->bit_depth);
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ const int new_fb = get_free_fb(cm);
+ RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb];
+ cm->cur_frame = &pool->frame_bufs[new_fb];
+ vp9_realloc_frame_buffer(&pool->frame_bufs[new_fb].buf,
+ cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ cm->use_highbitdepth,
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL);
+ scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
#else
- scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ const int new_fb = get_free_fb(cm);
+ RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb];
+ vp9_realloc_frame_buffer(&new_fb_ptr->buf,
+ cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL);
+ scale_and_extend_frame(ref, &new_fb_ptr->buf);
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+
+ alloc_frame_mvs(cm, new_fb);
+ } else {
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+ ++pool->frame_bufs[buf_idx].ref_count;
+ }
} else {
- cpi->scaled_ref_idx[ref_frame - 1] = idx;
- cm->frame_bufs[idx].ref_count++;
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
}
}
}
@@ -2447,9 +2748,15 @@ void vp9_scale_references(VP9_COMP *cpi) {
static void release_scaled_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
int i;
-
- for (i = 0; i < 3; i++)
- cm->frame_bufs[cpi->scaled_ref_idx[i]].ref_count--;
+ for (i = 0; i < MAX_REF_FRAMES; ++i) {
+ const int idx = cpi->scaled_ref_idx[i];
+ RefCntBuffer *const buf = idx != INVALID_IDX ?
+ &cm->buffer_pool->frame_bufs[idx] : NULL;
+ if (buf != NULL) {
+ --buf->ref_count;
+ cpi->scaled_ref_idx[i] = INVALID_IDX;
+ }
+ }
}
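
release_scaled_references() walks the per-reference slots, drops one reference count for each valid entry, and resets the slot to the INVALID_IDX sentinel so a second release of the same slot is harmless. The same pattern in isolation (types are illustrative):

    #define INVALID_IDX (-1)

    typedef struct { int ref_count; } RefCntBufSketch;

    /* Decrement the count for every valid slot, then poison the slot so
     * releasing twice has no effect. */
    static void release_refs(int *idx_slots, int n, RefCntBufSketch *bufs) {
      int i;
      for (i = 0; i < n; ++i) {
        if (idx_slots[i] != INVALID_IDX) {
          --bufs[idx_slots[i]].ref_count;
          idx_slots[i] = INVALID_IDX;
        }
      }
    }
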
static void full_to_model_count(unsigned int *model_count,
@@ -2478,20 +2785,22 @@ static void full_to_model_counts(vp9_coeff_count_model *model_count,
static void output_frame_level_debug_stats(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
- int recon_err;
+ int64_t recon_err;
vp9_clear_system_state();
recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10u %10d %10d %10d %10d"
+ fprintf(f, "%10u %dx%d %10d %10d %10d %10d"
"%10"PRId64" %10"PRId64" %10"PRId64" %10"PRId64" %10d "
"%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
"%6d %6d %5d %5d %5d "
"%10"PRId64" %10.3lf"
- "%10lf %8u %10d %10d %10d\n",
- cpi->common.current_video_frame, cpi->rc.this_frame_target,
+ "%10lf %8u %10"PRId64" %10d %10d\n",
+ cpi->common.current_video_frame,
+ cm->width, cm->height,
+ cpi->rc.this_frame_target,
cpi->rc.projected_frame_size,
cpi->rc.projected_frame_size / cpi->common.MBs,
(cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
@@ -2534,12 +2843,194 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
}
#endif
-static void encode_without_recode_loop(VP9_COMP *cpi,
- int q) {
+static void set_mv_search_params(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const unsigned int max_mv_def = MIN(cm->width, cm->height);
+
+ // Default based on max resolution.
+ cpi->mv_step_param = vp9_init_search_range(max_mv_def);
+
+ if (cpi->sf.mv.auto_mv_step_size) {
+ if (frame_is_intra_only(cm)) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ cpi->max_mv_magnitude = max_mv_def;
+ } else {
+ if (cm->show_frame) {
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ cpi->mv_step_param =
+ vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+ }
+ cpi->max_mv_magnitude = 0;
+ }
+ }
+}
+
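
With auto_mv_step_size enabled, the search range defaults to min(width, height) and, after a shown inter frame, is narrowed to twice the largest motion vector magnitude seen in that frame. Illustrative numbers (vp9_init_search_range() itself converts such a range into a step count and is not reproduced here):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
      const unsigned int width = 1280, height = 720;
      const unsigned int max_mv_def = MIN(width, height);      /* 720 */
      const unsigned int max_mv_magnitude = 24;  /* assumed prior frame */
      printf("default=%u adaptive=%u\n",
             max_mv_def, MIN(max_mv_def, 2 * max_mv_magnitude)); /* 720 48 */
      return 0;
    }
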
+static void set_size_independent_vars(VP9_COMP *cpi) {
+ vp9_set_speed_features_framesize_independent(cpi);
+ vp9_set_rd_speed_thresholds(cpi);
+ vp9_set_rd_speed_thresholds_sub8x8(cpi);
+ cpi->common.interp_filter = cpi->sf.default_interp_filter;
+}
+
+static void set_size_dependent_vars(VP9_COMP *cpi, int *q,
+ int *bottom_index, int *top_index) {
+ VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Setup variables that depend on the dimensions of the frame.
+ vp9_set_speed_features_framesize_dependent(cpi);
+
+ // Decide q and q bounds.
+ *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
+
+ if (!frame_is_intra_only(cm)) {
+ vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
+ }
+
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in the second pass of a two pass encode, as it requires
+ // lagged coding, and if the relevant speed feature flag is set.
+ if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+ configure_static_seg_features(cpi);
+
+#if CONFIG_VP9_POSTPROC
+ if (oxcf->noise_sensitivity > 0) {
+ int l = 0;
+ switch (oxcf->noise_sensitivity) {
+ case 1:
+ l = 20;
+ break;
+ case 2:
+ l = 40;
+ break;
+ case 3:
+ l = 60;
+ break;
+ case 4:
+ case 5:
+ l = 100;
+ break;
+ case 6:
+ l = 150;
+ break;
+ }
+ vp9_denoise(cpi->Source, cpi->Source, l);
+ }
+#endif // CONFIG_VP9_POSTPROC
+}
+
+static void init_motion_estimation(VP9_COMP *cpi) {
+ int y_stride = cpi->scaled_source.y_stride;
+
+ if (cpi->sf.mv.search_method == NSTEP) {
+ vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+ } else if (cpi->sf.mv.search_method == DIAMOND) {
+ vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+ }
+}
+
+void set_frame_size(VP9_COMP *cpi) {
+ int ref_frame;
VP9_COMMON *const cm = &cpi->common;
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ if (oxcf->pass == 2 &&
+ oxcf->rc_mode == VPX_VBR &&
+ ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) ||
+ (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) {
+ calculate_coded_size(
+ cpi, &oxcf->scaled_frame_width, &oxcf->scaled_frame_height);
+
+ // There has been a change in frame size.
+ vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
+ oxcf->scaled_frame_height);
+ }
+
+ if ((oxcf->pass == 2) &&
+ (!cpi->use_svc ||
+ (is_two_pass_svc(cpi) &&
+ cpi->svc.encode_empty_frame_state != ENCODING))) {
+ vp9_set_target_rate(cpi);
+ }
+
+ alloc_frame_mvs(cm, cm->new_fb_idx);
+
+ // Reset the frame pointers to the current frame size.
+ vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
+ cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL);
+
+ alloc_util_frame_buffers(cpi);
+ init_motion_estimation(cpi);
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+ ref_buf->idx = buf_idx;
+
+ if (buf_idx != INVALID_IDX) {
+ YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+ ref_buf->buf = buf;
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+ buf->y_crop_width, buf->y_crop_height,
+ cm->width, cm->height,
+ (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+ 1 : 0);
+#else
+ vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+ buf->y_crop_width, buf->y_crop_height,
+ cm->width, cm->height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ if (vp9_is_scaled(&ref_buf->sf))
+ vp9_extend_frame_borders(buf);
+ } else {
+ ref_buf->buf = NULL;
+ }
+ }
+
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
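
set_frame_size() installs per-reference scale factors so prediction can sample references whose dimensions differ from the coded frame; VP9 keeps the ratio in fixed point with 14 fractional bits. A sketch of that computation (the shift value is assumed to match REF_SCALE_SHIFT):

    #include <stdio.h>

    #define REF_SCALE_SHIFT 14  /* assumed fractional precision */

    static int fixed_point_scale(int other_size, int this_size) {
      return (other_size << REF_SCALE_SHIFT) / this_size;
    }

    int main(void) {
      /* A 1920-wide reference predicted onto a 1280-wide frame: 1.5x. */
      const int fp = fixed_point_scale(1920, 1280);
      printf("x_scale_fp=%d (%.3f)\n",
             fp, fp / (double)(1 << REF_SCALE_SHIFT)); /* 24576 (1.500) */
      return 0;
    }
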
+static void encode_without_recode_loop(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int q = 0, bottom_index = 0, top_index = 0; // Dummy variables.
+
vp9_clear_system_state();
+
+ set_frame_size(cpi);
+
+ cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+ &cpi->scaled_source);
+
+ if (cpi->unscaled_last_source != NULL)
+ cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+
+ if (frame_is_intra_only(cm) == 0) {
+ vp9_scale_references(cpi);
+ }
+
+ set_size_independent_vars(cpi);
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
vp9_set_quantizer(cm, q);
+ vp9_set_vbp_thresholds(cpi, q);
+
setup_frame(cpi);
+
+ vp9_suppress_active_map(cpi);
// Variance adaptive and in frame q adjustment experiments are mutually
// exclusive.
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
@@ -2549,9 +3040,19 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
} else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
vp9_cyclic_refresh_setup(cpi);
}
+ vp9_apply_active_map(cpi);
+
// transform / motion compensation build reconstruction frame
vp9_encode_frame(cpi);
+  // Update some stats from cyclic refresh, and check whether the golden
+  // reference update should be skipped, for non-SVC 1-pass CBR.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cm->frame_type != KEY_FRAME &&
+ !cpi->use_svc &&
+ (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR))
+ vp9_cyclic_refresh_check_golden_update(cpi);
+
// Update the skip mb flag probabilities based on the distribution
// seen in the last encoder iteration.
// update_base_skip_probs(cpi);
@@ -2560,28 +3061,66 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
static void encode_with_recode_loop(VP9_COMP *cpi,
size_t *size,
- uint8_t *dest,
- int q,
- int bottom_index,
- int top_index) {
+ uint8_t *dest) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
+ int bottom_index, top_index;
int loop_count = 0;
+ int loop_at_this_size = 0;
int loop = 0;
int overshoot_seen = 0;
int undershoot_seen = 0;
- int q_low = bottom_index, q_high = top_index;
int frame_over_shoot_limit;
int frame_under_shoot_limit;
+ int q = 0, q_low = 0, q_high = 0;
- // Decide frame size bounds
- vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
- &frame_under_shoot_limit,
- &frame_over_shoot_limit);
+ set_size_independent_vars(cpi);
do {
vp9_clear_system_state();
+ set_frame_size(cpi);
+
+ if (loop_count == 0 || cpi->resize_pending != 0) {
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+
+ // Reset the loop state for new frame size.
+ overshoot_seen = 0;
+ undershoot_seen = 0;
+
+ // Reconfiguration for change in frame size has concluded.
+ cpi->resize_pending = 0;
+
+ q_low = bottom_index;
+ q_high = top_index;
+
+ loop_at_this_size = 0;
+ }
+
+ // Decide frame size bounds first time through.
+ if (loop_count == 0) {
+ vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
+ }
+
+ cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+ &cpi->scaled_source);
+
+ if (cpi->unscaled_last_source != NULL)
+ cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+
+ if (frame_is_intra_only(cm) == 0) {
+ if (loop_count > 0) {
+ release_scaled_references(cpi);
+ }
+ vp9_scale_references(cpi);
+ }
+
vp9_set_quantizer(cm, q);
if (loop_count == 0)
@@ -2626,15 +3165,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
rc->this_key_frame_forced &&
(rc->projected_frame_size < rc->max_frame_bandwidth)) {
int last_q = q;
- int kf_err;
+ int64_t kf_err;
- int high_err_target = cpi->ambient_err;
- int low_err_target = cpi->ambient_err >> 1;
+ int64_t high_err_target = cpi->ambient_err;
+ int64_t low_err_target = cpi->ambient_err >> 1;
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm),
- cm->bit_depth);
+ kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
} else {
kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
}
@@ -2655,7 +3193,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
q_high = q > q_low ? q - 1 : q_low;
// Adjust Q
- q = (q * high_err_target) / kf_err;
+ q = (int)((q * high_err_target) / kf_err);
q = MIN(q, (q_high + q_low) >> 1);
} else if (kf_err < low_err_target &&
rc->projected_frame_size >= frame_under_shoot_limit) {
@@ -2664,7 +3202,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
q_low = q < q_high ? q + 1 : q_high;
// Adjust Q
- q = (q * low_err_target) / kf_err;
+ q = (int)((q * low_err_target) / kf_err);
q = MIN(q, (q_high + q_low + 1) >> 1);
}
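
For forced key frames the loop steers q so the reconstruction error lands between ambient_err/2 and ambient_err: q is scaled by the ratio of target to measured error, then clamped to the bisection midpoint of the current bracket. Worked numbers for the over-error branch (all values illustrative):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
      int q = 40, q_low = 20;
      int q_high = q > q_low ? q - 1 : q_low;        /* 39 */
      const long long high_err_target = 1000000;     /* ambient error */
      const long long kf_err = 1600000;              /* measured y sse */
      q = (int)((q * high_err_target) / kf_err);     /* 25 */
      q = MIN(q, (q_high + q_low) >> 1);             /* min(25, 29) */
      printf("next q = %d\n", q);                    /* 25 */
      return 0;
    }
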
@@ -2680,6 +3218,20 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
int last_q = q;
int retries = 0;
+ if (cpi->resize_pending == 1) {
+ // Change in frame size so go back around the recode loop.
+ cpi->rc.frame_size_selector =
+ SCALE_STEP1 - cpi->rc.frame_size_selector;
+ cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector;
+
+#if CONFIG_INTERNAL_STATS
+ ++cpi->tot_recode_hits;
+#endif
+ ++loop_count;
+ loop = 1;
+ continue;
+ }
+
// Frame size out of permitted range:
// Update correction factor & compute new Q to try...
@@ -2692,20 +3244,20 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
// Raise Qlow as to at least the current value
q_low = q < q_high ? q + 1 : q_high;
- if (undershoot_seen || loop_count > 1) {
+ if (undershoot_seen || loop_at_this_size > 1) {
          // Update rate correction factor.
- vp9_rc_update_rate_correction_factors(cpi, 1);
+ vp9_rc_update_rate_correction_factors(cpi);
q = (q_high + q_low + 1) / 2;
} else {
          // Update rate correction factor.
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, MAX(q_high, top_index));
while (q < q_low && retries < 10) {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, MAX(q_high, top_index));
retries++;
@@ -2717,11 +3269,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
// Frame is too small
q_high = q > q_low ? q - 1 : q_low;
- if (overshoot_seen || loop_count > 1) {
- vp9_rc_update_rate_correction_factors(cpi, 1);
+ if (overshoot_seen || loop_at_this_size > 1) {
+ vp9_rc_update_rate_correction_factors(cpi);
q = (q_high + q_low) / 2;
} else {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, top_index);
// Special case reset for qlow for constrained quality.
@@ -2734,7 +3286,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
}
while (q > q_high && retries < 10) {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, top_index);
retries++;
@@ -2747,7 +3299,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
// Clamp Q to upper and lower limits:
q = clamp(q, q_low, q_high);
- loop = q != last_q;
+ loop = (q != last_q);
} else {
loop = 0;
}
@@ -2759,10 +3311,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
loop = 0;
if (loop) {
- loop_count++;
+ ++loop_count;
+ ++loop_at_this_size;
#if CONFIG_INTERNAL_STATS
- cpi->tot_recode_hits++;
+ ++cpi->tot_recode_hits;
#endif
}
} while (loop);
@@ -2778,7 +3331,9 @@ static int get_ref_frame_flags(const VP9_COMP *cpi) {
if (gold_is_last)
flags &= ~VP9_GOLD_FLAG;
- if (cpi->rc.frames_till_gf_update_due == INT_MAX && !is_two_pass_svc(cpi))
+ if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
+ (cpi->svc.number_temporal_layers == 1 &&
+ cpi->svc.number_spatial_layers == 1))
flags &= ~VP9_GOLD_FLAG;
if (alt_is_last)
@@ -2823,25 +3378,6 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
}
}
-static int is_skippable_frame(const VP9_COMP *cpi) {
- // If the current frame does not have non-zero motion vector detected in the
- // first pass, and so do its previous and forward frames, then this frame
- // can be skipped for partition check, and the partition size is assigned
- // according to the variance
- const SVC *const svc = &cpi->svc;
- const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
- &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
-
- return (!frame_is_intra_only(&cpi->common) &&
- twopass->stats_in - 2 > twopass->stats_in_start &&
- twopass->stats_in < twopass->stats_in_end &&
- (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
- == 1 &&
- (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
- == 1 &&
- twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
-}
-
static void set_arf_sign_bias(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
int arf_sign_bias;
@@ -2858,31 +3394,6 @@ static void set_arf_sign_bias(VP9_COMP *cpi) {
cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
}
-static void set_mv_search_params(VP9_COMP *cpi) {
- const VP9_COMMON *const cm = &cpi->common;
- const unsigned int max_mv_def = MIN(cm->width, cm->height);
-
- // Default based on max resolution.
- cpi->mv_step_param = vp9_init_search_range(max_mv_def);
-
- if (cpi->sf.mv.auto_mv_step_size) {
- if (frame_is_intra_only(cm)) {
- // Initialize max_mv_magnitude for use in the first INTER frame
- // after a key/intra-only frame.
- cpi->max_mv_magnitude = max_mv_def;
- } else {
- if (cm->show_frame)
- // Allow mv_steps to correspond to twice the max mv magnitude found
- // in the previous frame, capped by the default max_mv_magnitude based
- // on resolution.
- cpi->mv_step_param =
- vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
- cpi->max_mv_magnitude = 0;
- }
- }
-}
-
-
int setup_interp_filter_search_mask(VP9_COMP *cpi) {
INTERP_FILTER ifilter;
int ref_total[MAX_REF_FRAMES] = {0};
@@ -2917,43 +3428,21 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
struct segmentation *const seg = &cm->seg;
TX_SIZE t;
- int q;
- int top_index;
- int bottom_index;
set_ext_overrides(cpi);
-
- cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
- &cpi->scaled_source);
-
- if (cpi->unscaled_last_source != NULL)
- cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
- &cpi->scaled_last_source);
-
- vp9_scale_references(cpi);
-
vp9_clear_system_state();
- // Enable or disable mode based tweaking of the zbin.
- // For 2 pass only used where GF/ARF prediction quality
- // is above a threshold.
- cpi->zbin_mode_boost = 0;
- cpi->zbin_mode_boost_enabled = 0;
-
// Set the arf sign bias for this frame.
set_arf_sign_bias(cpi);
// Set default state for segment based loop filter update flags.
cm->lf.mode_ref_delta_update = 0;
- set_mv_search_params(cpi);
-
if (cpi->oxcf.pass == 2 &&
cpi->sf.adaptive_interp_filter_search)
cpi->sf.interp_filter_search_mask =
setup_interp_filter_search_mask(cpi);
-
// Set various flags etc to special state if it is a key frame.
if (frame_is_intra_only(cm)) {
// Reset the loop filter deltas and segmentation map.
@@ -2969,6 +3458,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->rc.source_alt_ref_active = 0;
cm->error_resilient_mode = oxcf->error_resilient_mode;
+ cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
// By default, encoder assumes decoder can use prev_mi.
if (cm->error_resilient_mode) {
@@ -2976,7 +3466,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cm->reset_frame_context = 0;
cm->refresh_frame_context = 0;
} else if (cm->intra_only) {
- cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
// Only reset the current context.
cm->reset_frame_context = 2;
}
@@ -3013,20 +3502,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
}
- // Configure experimental use of segmentation for enhanced coding of
- // static regions if indicated.
- // Only allowed in second pass of two pass (as requires lagged coding)
- // and if the relevant speed feature flag is set.
- if (oxcf->pass == 2 && cpi->sf.static_segmentation)
- configure_static_seg_features(cpi);
-
- // Check if the current frame is skippable for the partition search in the
- // second pass according to the first pass stats
- if (cpi->sf.allow_partition_search_skip && oxcf->pass == 2 &&
- (!cpi->use_svc || is_two_pass_svc(cpi))) {
- cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
- }
-
// For 1 pass CBR, check if we are dropping this frame.
// Never drop on key frame.
if (oxcf->pass == 0 &&
@@ -3041,57 +3516,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_clear_system_state();
-#if CONFIG_VP9_POSTPROC
- if (oxcf->noise_sensitivity > 0) {
- int l = 0;
- switch (oxcf->noise_sensitivity) {
- case 1:
- l = 20;
- break;
- case 2:
- l = 40;
- break;
- case 3:
- l = 60;
- break;
- case 4:
- case 5:
- l = 100;
- break;
- case 6:
- l = 150;
- break;
- }
- vp9_denoise(cpi->Source, cpi->Source, l);
- }
-#endif
-
#if CONFIG_INTERNAL_STATS
- {
- int i;
- for (i = 0; i < MAX_MODES; ++i)
- cpi->mode_chosen_counts[i] = 0;
- }
+ memset(cpi->mode_chosen_counts, 0,
+ MAX_MODES * sizeof(*cpi->mode_chosen_counts));
#endif
- vp9_set_speed_features(cpi);
-
- vp9_set_rd_speed_thresholds(cpi);
- vp9_set_rd_speed_thresholds_sub8x8(cpi);
-
- // Decide q and q bounds.
- q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
-
- if (!frame_is_intra_only(cm)) {
- cm->interp_filter = cpi->sf.default_interp_filter;
- /* TODO: Decide this more intelligently */
- vp9_set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
- }
-
if (cpi->sf.recode_loop == DISALLOW_RECODE) {
- encode_without_recode_loop(cpi, q);
+ encode_without_recode_loop(cpi);
} else {
- encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index);
+ encode_with_recode_loop(cpi, size, dest);
}
#if CONFIG_VP9_TEMPORAL_DENOISING
@@ -3102,7 +3535,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
#endif
#endif
-
+#ifdef OUTPUT_YUV_SKINMAP
+ if (cpi->common.current_video_frame > 1) {
+ vp9_compute_skin_map(cpi, yuv_skinmap_file);
+ }
+#endif
// Special case code to reduce pulsing when key frames are forced at a
// fixed interval. Note the reconstruction error if it is the frame before
@@ -3111,8 +3548,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
- get_frame_new_buffer(cm),
- cm->bit_depth);
+ get_frame_new_buffer(cm));
} else {
cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
}
@@ -3136,11 +3572,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (cm->seg.update_map)
update_reference_segmentation_map(cpi);
- release_scaled_references(cpi);
+ if (frame_is_intra_only(cm) == 0) {
+ release_scaled_references(cpi);
+ }
vp9_update_reference_frames(cpi);
for (t = TX_4X4; t <= TX_32X32; t++)
- full_to_model_counts(cm->counts.coef[t], cpi->coef_counts[t]);
+ full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)
vp9_adapt_coef_probs(cm);
@@ -3196,13 +3635,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (cm->show_frame) {
vp9_swap_mi_and_prev_mi(cm);
-
// Don't increment frame counters if this was an altref buffer
    // update, not a real frame.
++cm->current_video_frame;
if (cpi->use_svc)
vp9_inc_frame_in_layer(cpi);
}
+ cm->prev_frame = cm->cur_frame;
if (is_two_pass_svc(cpi))
cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type =
@@ -3234,13 +3673,13 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size,
vp9_twopass_postencode_update(cpi);
}
-static void init_motion_estimation(VP9_COMP *cpi) {
- int y_stride = cpi->scaled_source.y_stride;
-
- if (cpi->sf.mv.search_method == NSTEP) {
- vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
- } else if (cpi->sf.mv.search_method == DIAMOND) {
- vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+static void init_ref_frame_bufs(VP9_COMMON *cm) {
+ int i;
+ BufferPool *const pool = cm->buffer_pool;
+ cm->new_fb_idx = INVALID_IDX;
+ for (i = 0; i < REF_FRAMES; ++i) {
+ cm->ref_frame_map[i] = INVALID_IDX;
+ pool->frame_bufs[i].ref_count = 0;
}
}
@@ -3251,7 +3690,12 @@ static void check_initial_width(VP9_COMP *cpi,
int subsampling_x, int subsampling_y) {
VP9_COMMON *const cm = &cpi->common;
- if (!cpi->initial_width) {
+ if (!cpi->initial_width ||
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth != use_highbitdepth ||
+#endif
+ cm->subsampling_x != subsampling_x ||
+ cm->subsampling_y != subsampling_y) {
cm->subsampling_x = subsampling_x;
cm->subsampling_y = subsampling_y;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -3259,16 +3703,31 @@ static void check_initial_width(VP9_COMP *cpi,
#endif
alloc_raw_frame_buffers(cpi);
- alloc_ref_frame_buffers(cpi);
+ init_ref_frame_bufs(cm);
alloc_util_frame_buffers(cpi);
- init_motion_estimation(cpi);
+ init_motion_estimation(cpi); // TODO(agrange) This can be removed.
cpi->initial_width = cm->width;
cpi->initial_height = cm->height;
+ cpi->initial_mbs = cm->MBs;
}
}
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ !cpi->denoiser.frame_buffer_initialized) {
+ vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS);
+ }
+}
+#endif
int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
@@ -3285,9 +3744,16 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
check_initial_width(cpi, subsampling_x, subsampling_y);
#endif // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
vpx_usec_timer_start(&timer);
- if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags))
+ if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ frame_flags))
res = -1;
vpx_usec_timer_mark(&timer);
cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
@@ -3295,13 +3761,13 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) &&
(subsampling_x != 1 || subsampling_y != 1)) {
vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
- "Non-4:2:0 color space requires profile 1 or 3");
+ "Non-4:2:0 color format requires profile 1 or 3");
res = -1;
}
if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) &&
(subsampling_x == 1 && subsampling_y == 1)) {
vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
- "4:2:0 color space requires profile 0 or 2");
+ "4:2:0 color format requires profile 0 or 2");
res = -1;
}
@@ -3402,19 +3868,33 @@ static void check_src_altref(VP9_COMP *cpi,
}
}
+#if CONFIG_INTERNAL_STATS
+extern double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+ int width, int height);
+#endif
+
+void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) {
+ s->stat[Y] += y;
+ s->stat[U] += u;
+ s->stat[V] += v;
+ s->stat[ALL] += all;
+ s->worst = MIN(s->worst, all);
+}
+
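
adjust_image_stat() accumulates per-channel running sums and tracks the worst overall score as the minimum, since for PSNR/SSIM-style metrics lower means worse. A tiny illustration of that aggregation:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
      double sum = 0.0, worst = 1e9;  /* assumed large initial value */
      const double frames[3] = {41.2, 38.7, 40.1};
      int i;
      for (i = 0; i < 3; ++i) {
        sum += frames[i];
        worst = MIN(worst, frames[i]);
      }
      printf("avg=%.3f worst=%.3f\n", sum / 3, worst); /* 40.000 38.700 */
      return 0;
    }
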
int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
size_t *size, uint8_t *dest,
int64_t *time_stamp, int64_t *time_end, int flush) {
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ BufferPool *const pool = cm->buffer_pool;
RATE_CONTROL *const rc = &cpi->rc;
struct vpx_usec_timer cmptimer;
YV12_BUFFER_CONFIG *force_src_buffer = NULL;
struct lookahead_entry *last_source = NULL;
struct lookahead_entry *source = NULL;
- MV_REFERENCE_FRAME ref_frame;
int arf_src_index;
+ int i;
if (is_two_pass_svc(cpi)) {
#if CONFIG_SPATIAL_SVC
@@ -3431,6 +3911,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+  // Check whether multi-arf is enabled.
+  // Note that at the moment multi_arf is only configured for 2 pass VBR and
+  // will not work properly with SVC.
+ if ((oxcf->pass == 2) && !cpi->use_svc &&
+ (cpi->oxcf.enable_auto_arf > 1))
+ cpi->multi_arf_allowed = 1;
+ else
+ cpi->multi_arf_allowed = 0;
+
// Normal defaults
cm->reset_frame_context = 0;
cm->refresh_frame_context = 1;
@@ -3456,7 +3945,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
int i;
// Reference a hidden frame from a lower layer
for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) {
- if (oxcf->ss_play_alternate[i]) {
+ if (oxcf->ss_enable_auto_arf[i]) {
cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx;
break;
}
@@ -3500,6 +3989,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
if (source != NULL) {
cm->show_frame = 1;
cm->intra_only = 0;
+    // If the flags indicate an intra frame but the current picture is for a
+    // non-zero spatial layer, it should not be an intra picture.
+ // TODO(Won Kap): this needs to change if per-layer intra frame is
+ // allowed.
+ if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->svc.spatial_layer_id) {
+ source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
+ }
// Check to see if the frame should be encoded as an arf overlay.
check_src_altref(cpi, source);
@@ -3544,23 +4040,17 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
vp9_restore_layer_context(cpi);
}
- // start with a 0 size frame
- *size = 0;
-
- /* find a free buffer for the new frame, releasing the reference previously
- * held.
- */
- cm->frame_bufs[cm->new_fb_idx].ref_count--;
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
cm->new_fb_idx = get_free_fb(cm);
- // For two pass encodes analyse the first pass stats and determine
- // the bit allocation and other parameters for this frame / group of frames.
- if ((oxcf->pass == 2) &&
- (!cpi->use_svc ||
- (is_two_pass_svc(cpi) &&
- cpi->svc.encode_empty_frame_state != ENCODING))) {
- vp9_rc_get_second_pass_params(cpi);
- }
+ if (cm->new_fb_idx == INVALID_IDX)
+ return -1;
+
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
if (!cpi->use_svc && cpi->multi_arf_allowed) {
if (cm->frame_type == KEY_FRAME) {
@@ -3571,70 +4061,38 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
}
- cpi->frame_flags = *frame_flags;
-
- if (oxcf->pass == 2 &&
- cm->current_video_frame == 0 &&
- oxcf->allow_spatial_resampling &&
- oxcf->rc_mode == VPX_VBR) {
- // Internal scaling is triggered on the first frame.
- vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
- oxcf->scaled_frame_height);
- }
-
- // Reset the frame pointers to the current frame size
- vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
- cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+ // Start with a 0 size frame.
+ *size = 0;
- alloc_util_frame_buffers(cpi);
- init_motion_estimation(cpi);
+ cpi->frame_flags = *frame_flags;
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
- YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf;
- RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
- ref_buf->buf = buf;
- ref_buf->idx = idx;
-#if CONFIG_VP9_HIGHBITDEPTH
- vp9_setup_scale_factors_for_frame(&ref_buf->sf,
- buf->y_crop_width, buf->y_crop_height,
- cm->width, cm->height,
- (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
- 1 : 0);
-#else
- vp9_setup_scale_factors_for_frame(&ref_buf->sf,
- buf->y_crop_width, buf->y_crop_height,
- cm->width, cm->height);
-#endif // CONFIG_VP9_HIGHBITDEPTH
- if (vp9_is_scaled(&ref_buf->sf))
- vp9_extend_frame_borders(buf);
+ if ((oxcf->pass == 2) &&
+ (!cpi->use_svc ||
+ (is_two_pass_svc(cpi) &&
+ cpi->svc.encode_empty_frame_state != ENCODING))) {
+ vp9_rc_get_second_pass_params(cpi);
+ } else {
+ set_frame_size(cpi);
}
- set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
-
- if (oxcf->aq_mode == VARIANCE_AQ) {
- vp9_vaq_init();
- }
+ for (i = 0; i < MAX_REF_FRAMES; ++i)
+ cpi->scaled_ref_idx[i] = INVALID_IDX;
if (oxcf->pass == 1 &&
(!cpi->use_svc || is_two_pass_svc(cpi))) {
const int lossless = is_lossless_requested(oxcf);
#if CONFIG_VP9_HIGHBITDEPTH
if (cpi->oxcf.use_highbitdepth)
- cpi->mb.fwd_txm4x4 = lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
+ cpi->td.mb.fwd_txm4x4 = lossless ?
+ vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
else
- cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
- cpi->mb.highbd_itxm_add = lossless ? vp9_highbd_iwht4x4_add :
+ cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+ cpi->td.mb.highbd_itxm_add = lossless ? vp9_highbd_iwht4x4_add :
vp9_highbd_idct4x4_add;
#else
- cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+ cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+ cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
vp9_first_pass(cpi, source);
} else if (oxcf->pass == 2 &&
(!cpi->use_svc || is_two_pass_svc(cpi))) {
@@ -3647,10 +4105,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
if (cm->refresh_frame_context)
- cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
- // Frame was dropped, release scaled references.
- if (*size == 0) {
+ // No frame encoded, or frame was dropped, release scaled references.
+ if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
release_scaled_references(cpi);
}
@@ -3676,6 +4134,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#if CONFIG_INTERNAL_STATS
if (oxcf->pass != 1) {
+ double samples;
cpi->bytes += (int)(*size);
if (cm->show_frame) {
@@ -3687,47 +4146,55 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
PSNR_STATS psnr;
#if CONFIG_VP9_HIGHBITDEPTH
- calc_highbd_psnr(orig, recon, &psnr, cpi->mb.e_mbd.bd,
+ calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
cpi->oxcf.input_bit_depth);
#else
calc_psnr(orig, recon, &psnr);
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->total += psnr.psnr[0];
- cpi->total_y += psnr.psnr[1];
- cpi->total_u += psnr.psnr[2];
- cpi->total_v += psnr.psnr[3];
+ adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3],
+ psnr.psnr[0], &cpi->psnr);
cpi->total_sq_error += psnr.sse[0];
cpi->total_samples += psnr.samples[0];
+ samples = psnr.samples[0];
{
PSNR_STATS psnr2;
double frame_ssim2 = 0, weight = 0;
#if CONFIG_VP9_POSTPROC
- // TODO(agrange) Add resizing of post-proc buffer in here when the
- // encoder is changed to use on-demand buffer allocation.
+ if (vp9_alloc_frame_buffer(&cm->post_proc_buffer,
+ recon->y_crop_width, recon->y_crop_height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment) < 0) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate post processing buffer");
+ }
+
vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
cm->lf.filter_level * 10 / 6);
#endif
vp9_clear_system_state();
#if CONFIG_VP9_HIGHBITDEPTH
- calc_highbd_psnr(orig, pp, &psnr, cpi->mb.e_mbd.bd,
+          calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd,
cpi->oxcf.input_bit_depth);
#else
calc_psnr(orig, pp, &psnr2);
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->totalp += psnr2.psnr[0];
- cpi->totalp_y += psnr2.psnr[1];
- cpi->totalp_u += psnr2.psnr[2];
- cpi->totalp_v += psnr2.psnr[3];
cpi->totalp_sq_error += psnr2.sse[0];
cpi->totalp_samples += psnr2.samples[0];
+ adjust_image_stat(psnr2.psnr[1], psnr2.psnr[2], psnr2.psnr[3],
+ psnr2.psnr[0], &cpi->psnrp);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- frame_ssim2 = vp9_highbd_calc_ssim(orig, recon, &weight, xd->bd);
+ frame_ssim2 = vp9_highbd_calc_ssim(orig, recon, &weight,
+ (int)cm->bit_depth);
} else {
frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
}
@@ -3735,13 +4202,14 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
#endif // CONFIG_VP9_HIGHBITDEPTH
+          cpi->worst_ssim = MIN(cpi->worst_ssim, frame_ssim2);
cpi->summed_quality += frame_ssim2 * weight;
cpi->summed_weights += weight;
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
frame_ssim2 = vp9_highbd_calc_ssim(
- orig, &cm->post_proc_buffer, &weight, xd->bd);
+ orig, &cm->post_proc_buffer, &weight, (int)cm->bit_depth);
} else {
frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
}
@@ -3762,14 +4230,40 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#endif
}
}
+ if (cpi->b_calculate_blockiness) {
+ double frame_blockiness = vp9_get_blockiness(
+ cpi->Source->y_buffer, cpi->Source->y_stride,
+ cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+ cpi->Source->y_width, cpi->Source->y_height);
+ cpi->worst_blockiness = MAX(cpi->worst_blockiness, frame_blockiness);
+ cpi->total_blockiness += frame_blockiness;
+ }
+ if (cpi->b_calculate_consistency) {
+ double this_inconsistency = vp9_get_ssim_metrics(
+ cpi->Source->y_buffer, cpi->Source->y_stride,
+ cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+ cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars,
+ &cpi->metrics, 1);
+
+ const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+
+ double consistency = vpx_sse_to_psnr(samples, peak,
+ (double)cpi->total_inconsistency);
+
+ if (consistency > 0.0)
+ cpi->worst_consistency = MIN(cpi->worst_consistency,
+ consistency);
+ cpi->total_inconsistency += this_inconsistency;
+ }
if (cpi->b_calculate_ssimg) {
double y, u, v, frame_all;
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
frame_all = vp9_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
- &u, &v, xd->bd);
+ &u, &v, (int)cm->bit_depth);
} else {
frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
&v);
@@ -3777,10 +4271,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#else
frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->total_ssimg_y += y;
- cpi->total_ssimg_u += u;
- cpi->total_ssimg_v += v;
- cpi->total_ssimg_all += frame_all;
+ adjust_image_stat(y, u, v, frame_all, &cpi->ssimg);
+ }
+ {
+ double y, u, v, frame_all;
+ frame_all = vp9_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u,
+ &v);
+ adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+ /* TODO(JBB): add 10/12 bit support */
+ }
+ {
+ double y, u, v, frame_all;
+ frame_all = vp9_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v);
+ adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
}
}
}
@@ -3833,29 +4336,6 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
}
}
-int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols) {
- if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
- const int mi_rows = cpi->common.mi_rows;
- const int mi_cols = cpi->common.mi_cols;
- if (map) {
- int r, c;
- for (r = 0; r < mi_rows; r++) {
- for (c = 0; c < mi_cols; c++) {
- cpi->segmentation_map[r * mi_cols + c] =
- !map[(r >> 1) * cols + (c >> 1)];
- }
- }
- vp9_enable_segfeature(&cpi->common.seg, 1, SEG_LVL_SKIP);
- vp9_enable_segmentation(&cpi->common.seg);
- } else {
- vp9_disable_segmentation(&cpi->common.seg);
- }
- return 0;
- } else {
- return -1;
- }
-}
-
int vp9_set_internal_size(VP9_COMP *cpi,
VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
VP9_COMMON *cm = &cpi->common;
@@ -3882,11 +4362,15 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
unsigned int height) {
VP9_COMMON *cm = &cpi->common;
#if CONFIG_VP9_HIGHBITDEPTH
- check_initial_width(cpi, 1, 1, cm->use_highbitdepth);
+ check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
#else
check_initial_width(cpi, 1, 1);
#endif // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+
if (width) {
cm->width = width;
if (cm->width > cpi->initial_width) {
@@ -3915,41 +4399,25 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc) {
return;
}
-int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) {
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
assert(a->y_crop_width == b->y_crop_width);
assert(a->y_crop_height == b->y_crop_height);
- return (int)get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height);
+ return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
}
#if CONFIG_VP9_HIGHBITDEPTH
-int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b,
- vpx_bit_depth_t bit_depth) {
- unsigned int sse;
- int sum;
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
assert(a->y_crop_width == b->y_crop_width);
assert(a->y_crop_height == b->y_crop_height);
assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
- switch (bit_depth) {
- case VPX_BITS_8:
- highbd_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height, &sse, &sum);
- return (int) sse;
- case VPX_BITS_10:
- highbd_10_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height, &sse, &sum);
- return (int) sse;
- case VPX_BITS_12:
- highbd_12_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height, &sse, &sum);
- return (int) sse;
- default:
- assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
- return -1;
- }
+
+ return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
index 2c56b81f360..41f1c13d493 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
@@ -17,9 +17,12 @@
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8cx.h"
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_ppflags.h"
#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_thread.h"
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/encoder/vp9_context_tree.h"
@@ -31,10 +34,14 @@
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
+#if CONFIG_INTERNAL_STATS
+#include "vp9/encoder/vp9_ssim.h"
+#endif
#include "vp9/encoder/vp9_speed_features.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_variance.h"
+
#if CONFIG_VP9_TEMPORAL_DENOISING
#include "vp9/encoder/vp9_denoiser.h"
#endif
@@ -109,6 +116,11 @@ typedef enum {
AQ_MODE_COUNT // This should always be the last member of the enum
} AQ_MODE;
+typedef enum {
+ RESIZE_NONE = 0, // No frame resizing allowed (except for SVC).
+ RESIZE_FIXED = 1, // All frames are coded at the specified dimension.
+ RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec.
+} RESIZE_TYPE;
typedef struct VP9EncoderConfig {
BITSTREAM_PROFILE profile;
@@ -122,7 +134,12 @@ typedef struct VP9EncoderConfig {
  int noise_sensitivity;  // pre-processing blur: recommendation 0
  int sharpness;          // sharpening output: recommendation 0
int speed;
+ // maximum allowed bitrate for any intra frame in % of bitrate target.
unsigned int rc_max_intra_bitrate_pct;
+ // maximum allowed bitrate for any inter frame in % of bitrate target.
+ unsigned int rc_max_inter_bitrate_pct;
+ // percent of rate boost for golden frame in CBR mode.
+ unsigned int gf_cbr_boost_pct;
MODE mode;
int pass;
@@ -159,7 +176,7 @@ typedef struct VP9EncoderConfig {
AQ_MODE aq_mode; // Adaptive Quantization mode
// Internal frame size scaling.
- int allow_spatial_resampling;
+ RESIZE_TYPE resize_mode;
int scaled_frame_width;
int scaled_frame_height;
@@ -178,13 +195,12 @@ typedef struct VP9EncoderConfig {
int ts_number_layers; // Number of temporal layers.
// Bitrate allocation for spatial layers.
int ss_target_bitrate[VPX_SS_MAX_LAYERS];
- int ss_play_alternate[VPX_SS_MAX_LAYERS];
+ int ss_enable_auto_arf[VPX_SS_MAX_LAYERS];
// Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
int ts_target_bitrate[VPX_TS_MAX_LAYERS];
int ts_rate_decimator[VPX_TS_MAX_LAYERS];
- // these parameters aren't to be used in final build don't use!!!
- int play_alternate;
+ int enable_auto_arf;
  int encode_breakout;  // early breakout: for video conf recommend 800
@@ -206,6 +222,8 @@ typedef struct VP9EncoderConfig {
int tile_columns;
int tile_rows;
+ int max_threads;
+
vpx_fixed_buf_t two_pass_stats_in;
struct vpx_codec_pkt_list *output_pkt_list;
@@ -218,15 +236,62 @@ typedef struct VP9EncoderConfig {
#if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth;
#endif
+ vpx_color_space_t color_space;
} VP9EncoderConfig;
static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
}
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+ TileInfo tile_info;
+ int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+ int mode_map[BLOCK_SIZES][MAX_MODES];
+} TileDataEnc;
+
+typedef struct RD_COUNTS {
+ vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+ int64_t comp_pred_diff[REFERENCE_MODES];
+ int64_t tx_select_diff[TX_MODES];
+ int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+} RD_COUNTS;
+
+typedef struct ThreadData {
+ MACROBLOCK mb;
+ RD_COUNTS rd_counts;
+ FRAME_COUNTS *counts;
+
+ PICK_MODE_CONTEXT *leaf_tree;
+ PC_TREE *pc_tree;
+ PC_TREE *pc_root;
+} ThreadData;
+
+struct EncWorkerData;
+
+typedef struct ActiveMap {
+ int enabled;
+ int update;
+ unsigned char *map;
+} ActiveMap;
+
+typedef enum {
+ Y,
+ U,
+ V,
+ ALL
+} STAT_TYPE;
+
+typedef struct IMAGE_STAT {
+ double stat[ALL+1];
+ double worst;
+} ImageStat;
+
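/* Context (sketch, not part of this hunk): the ImageStat accumulators above
 * are updated once per frame by the adjust_image_stat() helper this patch
 * adds in vp9_encoder.c. A minimal sketch of such a helper, assuming the
 * Y/U/V/ALL indices from STAT_TYPE and the tree's MIN() macro: */
static void adjust_image_stat(double y, double u, double v, double all,
                              ImageStat *s) {
  s->stat[Y] += y;                /* per-plane running totals */
  s->stat[U] += u;
  s->stat[V] += v;
  s->stat[ALL] += all;            /* composite-score total */
  s->worst = MIN(s->worst, all);  /* lower is worse for PSNR/SSIM metrics */
}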
typedef struct VP9_COMP {
QUANTS quants;
- MACROBLOCK mb;
+ ThreadData td;
+ DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
VP9_COMMON common;
VP9EncoderConfig oxcf;
struct lookahead_ctx *lookahead;
@@ -239,10 +304,12 @@ typedef struct VP9_COMP {
YV12_BUFFER_CONFIG *unscaled_last_source;
YV12_BUFFER_CONFIG scaled_last_source;
+ TileDataEnc *tile_data;
+
// For a still frame, this flag is set to 1 to skip partition search.
int partition_search_skippable_frame;
- int scaled_ref_idx[3];
+ int scaled_ref_idx[MAX_REF_FRAMES];
int lst_fb_idx;
int gld_fb_idx;
int alt_fb_idx;
@@ -261,11 +328,11 @@ typedef struct VP9_COMP {
YV12_BUFFER_CONFIG last_frame_uf;
- TOKENEXTRA *tok;
+ TOKENEXTRA *tile_tok[4][1 << 6];
unsigned int tok_count[4][1 << 6];
// Ambient reconstruction err target for force key frames
- int ambient_err;
+ int64_t ambient_err;
RD_OPT rd;
@@ -276,9 +343,6 @@ typedef struct VP9_COMP {
int *nmvsadcosts[2];
int *nmvsadcosts_hp[2];
- int zbin_mode_boost;
- int zbin_mode_boost_enabled;
-
int64_t last_time_stamp_seen;
int64_t last_end_time_stamp_seen;
int64_t first_time_stamp_ever;
@@ -286,7 +350,6 @@ typedef struct VP9_COMP {
RATE_CONTROL rc;
double framerate;
- vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE];
struct vpx_codec_pkt_list *output_pkt_list;
@@ -301,6 +364,8 @@ typedef struct VP9_COMP {
unsigned int max_mv_magnitude;
int mv_step_param;
+ int allow_comp_inter_inter;
+
// Default value is 1. From first pass stats, encode_breakout may be disabled.
ENCODE_BREAKOUT_TYPE allow_encode_breakout;
@@ -313,13 +378,11 @@ typedef struct VP9_COMP {
  // segment threshold for encode breakout
int segment_encode_breakout[MAX_SEGMENTS];
- unsigned char *complexity_map;
-
CYCLIC_REFRESH *cyclic_refresh;
+ ActiveMap active_map;
fractional_mv_step_fp *find_fractional_mv_step;
vp9_full_search_fn_t full_search_sad;
- vp9_refining_search_fn_t refining_search_sad;
vp9_diamond_search_fn_t diamond_search_sad;
vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
uint64_t time_receive_data;
@@ -340,19 +403,16 @@ typedef struct VP9_COMP {
unsigned int mode_chosen_counts[MAX_MODES];
int count;
- double total_y;
- double total_u;
- double total_v;
- double total;
uint64_t total_sq_error;
uint64_t total_samples;
+ ImageStat psnr;
- double totalp_y;
- double totalp_u;
- double totalp_v;
- double totalp;
uint64_t totalp_sq_error;
uint64_t totalp_samples;
+ ImageStat psnrp;
+
+ double total_blockiness;
+ double worst_blockiness;
int bytes;
double summed_quality;
@@ -360,14 +420,21 @@ typedef struct VP9_COMP {
double summedp_quality;
double summedp_weights;
unsigned int tot_recode_hits;
+ double worst_ssim;
-
- double total_ssimg_y;
- double total_ssimg_u;
- double total_ssimg_v;
- double total_ssimg_all;
+ ImageStat ssimg;
+ ImageStat fastssim;
+ ImageStat psnrhvs;
int b_calculate_ssimg;
+ int b_calculate_blockiness;
+
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
#endif
int b_calculate_psnr;
@@ -375,6 +442,10 @@ typedef struct VP9_COMP {
int initial_width;
int initial_height;
+ int initial_mbs; // Number of MBs in the full-size frame; to be used to
+ // normalize the firstpass stats. This will differ from the
+ // number of MBs in the current frame when the frame is
+ // scaled.
int use_svc;
@@ -395,10 +466,6 @@ typedef struct VP9_COMP {
int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-
- PICK_MODE_CONTEXT *leaf_tree;
- PC_TREE *pc_tree;
- PC_TREE *pc_root;
int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
int multi_arf_allowed;
@@ -408,11 +475,28 @@ typedef struct VP9_COMP {
#if CONFIG_VP9_TEMPORAL_DENOISING
VP9_DENOISER denoiser;
#endif
+
+ int resize_pending;
+
+ // VAR_BASED_PARTITION thresholds
+ // 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+ int64_t vbp_thresholds[4];
+ int64_t vbp_threshold_minmax;
+ int64_t vbp_threshold_sad;
+ BLOCK_SIZE vbp_bsize_min;
+
+ // Multi-threading
+ int num_workers;
+ VP9Worker *workers;
+ struct EncWorkerData *tile_thr_data;
+ VP9LfSync lf_row_sync;
} VP9_COMP;
-void vp9_initialize_enc();
+void vp9_initialize_enc(void);
-struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf);
+struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+ BufferPool *const pool);
void vp9_remove_compressor(VP9_COMP *cpi);
void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
@@ -444,6 +528,8 @@ int vp9_update_entropy(VP9_COMP *cpi, int update);
int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+
int vp9_set_internal_size(VP9_COMP *cpi,
VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
@@ -454,8 +540,14 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc);
int vp9_get_quantizer(struct VP9_COMP *cpi);
-static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
- MV_REFERENCE_FRAME ref_frame) {
+static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) {
+ return frame_is_intra_only(&cpi->common) ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi,
+ MV_REFERENCE_FRAME ref_frame) {
if (ref_frame == LAST_FRAME) {
return cpi->lst_fb_idx;
} else if (ref_frame == GOLDEN_FRAME) {
@@ -465,11 +557,19 @@ static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
}
}
+static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi,
+ int ref_frame) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+ return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
+}
+
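/* Lookup chain formalized by these helpers (summary of the code in this
 * hunk): ref_frame (LAST/GOLDEN/ALTREF)
 *   -> get_ref_frame_map_idx(): slot in cm->ref_frame_map, selected by
 *      cpi->lst_fb_idx / gld_fb_idx / alt_fb_idx
 *   -> get_ref_frame_buf_idx(): index into cm->buffer_pool->frame_bufs
 *   -> get_ref_frame_buffer() (below): the YV12 buffer itself, or NULL when
 *      either step returns INVALID_IDX. */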
static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
- VP9_COMMON * const cm = &cpi->common;
- return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
- .buf;
+ VP9_COMMON *const cm = &cpi->common;
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ return
+ buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf : NULL;
}
static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
@@ -490,11 +590,10 @@ static INLINE int allocated_tokens(TileInfo tile) {
return get_token_alloc(tile_mb_rows, tile_mb_cols);
}
-int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
#if CONFIG_VP9_HIGHBITDEPTH
-int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b,
- vpx_bit_depth_t bit_depth);
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_alloc_compressor_data(VP9_COMP *cpi);
@@ -513,16 +612,15 @@ void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) {
return cpi->use_svc &&
- (cpi->svc.number_temporal_layers > 1 ||
- cpi->svc.number_spatial_layers > 1) &&
- (cpi->oxcf.pass == 1 || cpi->oxcf.pass == 2);
+ ((cpi->svc.number_spatial_layers > 1) ||
+ (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.pass != 0));
}
static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 &&
- (cpi->oxcf.play_alternate &&
+ (cpi->oxcf.enable_auto_arf &&
(!is_two_pass_svc(cpi) ||
- cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]));
+ cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]));
}
static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -542,6 +640,8 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
}
+void vp9_new_framerate(VP9_COMP *cpi, double framerate);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c
new file mode 100644
index 00000000000..8700ccdaecd
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+ int i, j, k, l, m, n;
+
+ for (i = 0; i < REFERENCE_MODES; i++)
+ td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i];
+
+ for (i = 0; i < TX_MODES; i++)
+ td->rd_counts.tx_select_diff[i] += td_t->rd_counts.tx_select_diff[i];
+
+ for (i = 0; i < TX_SIZES; i++)
+ for (j = 0; j < PLANE_TYPES; j++)
+ for (k = 0; k < REF_TYPES; k++)
+ for (l = 0; l < COEF_BANDS; l++)
+ for (m = 0; m < COEFF_CONTEXTS; m++)
+ for (n = 0; n < ENTROPY_TOKENS; n++)
+ td->rd_counts.coef_counts[i][j][k][l][m][n] +=
+ td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+}
+
+static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+ VP9_COMP *const cpi = thread_data->cpi;
+ const VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ int t;
+
+ (void) unused;
+
+ for (t = thread_data->start; t < tile_rows * tile_cols;
+ t += cpi->num_workers) {
+ int tile_row = t / tile_cols;
+ int tile_col = t % tile_cols;
+
+ vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col);
+ }
+
+ return 0;
+}
+
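/* Illustration (not part of the patch): with tile_rows * tile_cols == 4 and
 * cpi->num_workers == 2, worker 0 (start == 0) encodes tiles 0 and 2 while
 * worker 1 (start == 1) encodes tiles 1 and 3; each worker advances the
 * tile index by num_workers, so the tiles are shared round-robin however
 * the counts divide. */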
+void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols);
+ int i;
+
+ vp9_init_tile_data(cpi);
+
+ // Only run once to create threads and allocate thread data.
+ if (cpi->num_workers == 0) {
+ CHECK_MEM_ERROR(cm, cpi->workers,
+ vpx_malloc(num_workers * sizeof(*cpi->workers)));
+
+ CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+ vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
+ for (i = 0; i < num_workers; i++) {
+ VP9Worker *const worker = &cpi->workers[i];
+ EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+
+ ++cpi->num_workers;
+ winterface->init(worker);
+
+ if (i < num_workers - 1) {
+ thread_data->cpi = cpi;
+
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ vpx_memalign(32, sizeof(*thread_data->td)));
+ vp9_zero(*thread_data->td);
+
+ // Set up pc_tree.
+ thread_data->td->leaf_tree = NULL;
+ thread_data->td->pc_tree = NULL;
+ vp9_setup_pc_tree(cm, thread_data->td);
+
+ // Allocate frame counters in thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td->counts,
+ vpx_calloc(1, sizeof(*thread_data->td->counts)));
+
+ // Create threads
+ if (!winterface->reset(worker))
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ } else {
+ // Main thread acts as a worker and uses the thread data in cpi.
+ thread_data->cpi = cpi;
+ thread_data->td = &cpi->td;
+ }
+
+ winterface->sync(worker);
+ }
+ }
+
+ for (i = 0; i < num_workers; i++) {
+ VP9Worker *const worker = &cpi->workers[i];
+ EncWorkerData *thread_data;
+
+ worker->hook = (VP9WorkerHook)enc_worker_hook;
+ worker->data1 = &cpi->tile_thr_data[i];
+ worker->data2 = NULL;
+ thread_data = (EncWorkerData*)worker->data1;
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ }
+ if (thread_data->td->counts != &cpi->common.counts) {
+ memcpy(thread_data->td->counts, &cpi->common.counts,
+ sizeof(cpi->common.counts));
+ }
+
+ // Handle use_nonrd_pick_mode case.
+ if (cpi->sf.use_nonrd_pick_mode) {
+ MACROBLOCK *const x = &thread_data->td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
+ int j;
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ p[j].coeff = ctx->coeff_pbuf[j][0];
+ p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
+ pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
+ p[j].eobs = ctx->eobs_pbuf[j][0];
+ }
+ }
+ }
+
+ // Encode a frame
+ for (i = 0; i < num_workers; i++) {
+ VP9Worker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ if (i == num_workers - 1)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+
+ // Encoding ends.
+ for (i = 0; i < num_workers; i++) {
+ VP9Worker *const worker = &cpi->workers[i];
+ winterface->sync(worker);
+ }
+
+ for (i = 0; i < num_workers; i++) {
+ VP9Worker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+ // Accumulate counters.
+ if (i < num_workers - 1) {
+ vp9_accumulate_frame_counts(cm, thread_data->td->counts, 0);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.h
new file mode 100644
index 00000000000..e87c50bc712
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_ETHREAD_H_
+#define VP9_ENCODER_VP9_ETHREAD_H_
+
+struct VP9_COMP;
+struct ThreadData;
+
+typedef struct EncWorkerData {
+ struct VP9_COMP *cpi;
+ struct ThreadData *td;
+ int start;
+} EncWorkerData;
+
+void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
+
+#endif // VP9_ENCODER_VP9_ETHREAD_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c
index c9b2131426f..96f3598b1dc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c
@@ -27,9 +27,9 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
uint8_t *dst_ptr2 = dst + w;
for (i = 0; i < h; i++) {
- vpx_memset(dst_ptr1, src_ptr1[0], extend_left);
- vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w);
- vpx_memset(dst_ptr2, src_ptr2[0], extend_right);
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+ memset(dst_ptr2, src_ptr2[0], extend_right);
src_ptr1 += src_pitch;
src_ptr2 += src_pitch;
dst_ptr1 += dst_pitch;
@@ -45,12 +45,12 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
linesize = extend_left + extend_right + w;
for (i = 0; i < extend_top; i++) {
- vpx_memcpy(dst_ptr1, src_ptr1, linesize);
+ memcpy(dst_ptr1, src_ptr1, linesize);
dst_ptr1 += dst_pitch;
}
for (i = 0; i < extend_bottom; i++) {
- vpx_memcpy(dst_ptr2, src_ptr2, linesize);
+ memcpy(dst_ptr2, src_ptr2, linesize);
dst_ptr2 += dst_pitch;
}
}
@@ -73,7 +73,7 @@ static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
for (i = 0; i < h; i++) {
vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
- vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(uint16_t));
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(uint16_t));
vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
src_ptr1 += src_pitch;
src_ptr2 += src_pitch;
@@ -90,12 +90,12 @@ static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
linesize = extend_left + extend_right + w;
for (i = 0; i < extend_top; i++) {
- vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
dst_ptr1 += dst_pitch;
}
for (i = 0; i < extend_bottom; i++) {
- vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
dst_ptr2 += dst_pitch;
}
}
@@ -110,9 +110,9 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
// Motion estimation may use src block variance with the block size up
// to 64x64, so the right and bottom need to be extended to 64 multiple
// or up to 16, whichever is greater.
-  const int eb_y = MAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6))
-                   - src->y_crop_width;
-  const int er_y = MAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6))
-                   - src->y_crop_height;
+  const int er_y = MAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6))
+                   - src->y_crop_width;
+  const int eb_y = MAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6))
+                   - src->y_crop_height;
const int uv_width_subsampling = (src->uv_width != src->y_width);
const int uv_height_subsampling = (src->uv_height != src->y_height);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_fastssim.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_fastssim.c
new file mode 100644
index 00000000000..f1d408cbe7b
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_fastssim.c
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * This code was originally written by: Nathan E. Egge, at the Daala
+ * project.
+ */
+#include <math.h>
+#include <string.h>
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/encoder/vp9_ssim.h"
+/* TODO(jbb): High bit depth version of this code needed */
+typedef struct fs_level fs_level;
+typedef struct fs_ctx fs_ctx;
+
+#define SSIM_C1 (255 * 255 * 0.01 * 0.01)
+#define SSIM_C2 (255 * 255 * 0.03 * 0.03)
+
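/* These are the standard SSIM stabilization constants of Wang et al. (2004):
 * C1 = (K1 * L)^2 and C2 = (K2 * L)^2 with K1 = 0.01, K2 = 0.03 and dynamic
 * range L = 255. They keep the per-window ratio
 *   SSIM(x, y) = ((2*mu_x*mu_y + C1) * (2*sigma_xy + C2))
 *              / ((mu_x^2 + mu_y^2 + C1) * (sigma_x^2 + sigma_y^2 + C2))
 * well defined when the means or variances approach zero. */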
+#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
+#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
+
+struct fs_level {
+ uint16_t *im1;
+ uint16_t *im2;
+ double *ssim;
+ int w;
+ int h;
+};
+
+struct fs_ctx {
+ fs_level *level;
+ int nlevels;
+ unsigned *col_buf;
+};
+
+static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+ unsigned char *data;
+ size_t data_size;
+ int lw;
+ int lh;
+ int l;
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ data_size = _nlevels * sizeof(fs_level)
+ + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ im_size = lw * (size_t) lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size += im_size;
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ data_size += level_size;
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ data = (unsigned char *) malloc(data_size);
+ _ctx->level = (fs_level *) data;
+ _ctx->nlevels = _nlevels;
+ data += _nlevels * sizeof(*_ctx->level);
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ _ctx->level[l].w = lw;
+ _ctx->level[l].h = lh;
+ im_size = lw * (size_t) lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ _ctx->level[l].im1 = (uint16_t *) data;
+ _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
+ data += level_size;
+ _ctx->level[l].ssim = (double *) data;
+ data += im_size * sizeof(*_ctx->level[l].ssim);
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ _ctx->col_buf = (unsigned *) data;
+}
+
+static void fs_ctx_clear(fs_ctx *_ctx) {
+ free(_ctx->level);
+}
+
+static void fs_downsample_level(fs_ctx *_ctx, int _l) {
+ const uint16_t *src1;
+ const uint16_t *src2;
+ uint16_t *dst1;
+ uint16_t *dst2;
+ int w2;
+ int h2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ dst1 = _ctx->level[_l].im1;
+ dst2 = _ctx->level[_l].im2;
+ w2 = _ctx->level[_l - 1].w;
+ h2 = _ctx->level[_l - 1].h;
+ src1 = _ctx->level[_l - 1].im1;
+ src2 = _ctx->level[_l - 1].im2;
+ for (j = 0; j < h; j++) {
+ int j0offs;
+ int j1offs;
+ j0offs = 2 * j * w2;
+ j1offs = FS_MINI(2 * j + 1, h2) * w2;
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, w2);
+ dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1]
+ + src1[j1offs + i0] + src1[j1offs + i1];
+ dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1]
+ + src2[j1offs + i0] + src2[j1offs + i1];
+ }
+ }
+}
+
+static void fs_downsample_level0(fs_ctx *_ctx, const unsigned char *_src1,
+ int _s1ystride, const unsigned char *_src2,
+ int _s2ystride, int _w, int _h) {
+ uint16_t *dst1;
+ uint16_t *dst2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[0].w;
+ h = _ctx->level[0].h;
+ dst1 = _ctx->level[0].im1;
+ dst2 = _ctx->level[0].im2;
+ for (j = 0; j < h; j++) {
+ int j0;
+ int j1;
+ j0 = 2 * j;
+ j1 = FS_MINI(j0 + 1, _h);
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, _w);
+ dst1[j * w + i] = _src1[j0 * _s1ystride + i0]
+ + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0]
+ + _src1[j1 * _s1ystride + i1];
+ dst2[j * w + i] = _src2[j0 * _s2ystride + i0]
+ + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0]
+ + _src2[j1 * _s2ystride + i1];
+ }
+ }
+}
+
+static void fs_apply_luminance(fs_ctx *_ctx, int _l) {
+ unsigned *col_sums_x;
+ unsigned *col_sums_y;
+ uint16_t *im1;
+ uint16_t *im2;
+ double *ssim;
+ double c1;
+ int w;
+ int h;
+ int j0offs;
+ int j1offs;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ col_sums_x = _ctx->col_buf;
+ col_sums_y = col_sums_x + w;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ for (i = 0; i < w; i++)
+ col_sums_x[i] = 5 * im1[i];
+ for (i = 0; i < w; i++)
+ col_sums_y[i] = 5 * im2[i];
+ for (j = 1; j < 4; j++) {
+ j1offs = FS_MINI(j, h - 1) * w;
+ for (i = 0; i < w; i++)
+ col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++)
+ col_sums_y[i] += im2[j1offs + i];
+ }
+ ssim = _ctx->level[_l].ssim;
+ c1 = (double) (SSIM_C1 * 4096 * (1 << 4 * _l));
+ for (j = 0; j < h; j++) {
+ unsigned mux;
+ unsigned muy;
+ int i0;
+ int i1;
+ mux = 5 * col_sums_x[0];
+ muy = 5 * col_sums_y[0];
+ for (i = 1; i < 4; i++) {
+ i1 = FS_MINI(i, w - 1);
+ mux += col_sums_x[i1];
+ muy += col_sums_y[i1];
+ }
+ for (i = 0; i < w; i++) {
+ ssim[j * w + i] *= (2 * mux * (double) muy + c1)
+ / (mux * (double) mux + muy * (double) muy + c1);
+ if (i + 1 < w) {
+ i0 = FS_MAXI(0, i - 4);
+ i1 = FS_MINI(i + 4, w - 1);
+ mux += col_sums_x[i1] - col_sums_x[i0];
+        muy += col_sums_y[i1] - col_sums_y[i0];
+ }
+ }
+ if (j + 1 < h) {
+ j0offs = FS_MAXI(0, j - 4) * w;
+ for (i = 0; i < w; i++)
+ col_sums_x[i] -= im1[j0offs + i];
+ for (i = 0; i < w; i++)
+ col_sums_y[i] -= im2[j0offs + i];
+ j1offs = FS_MINI(j + 4, h - 1) * w;
+ for (i = 0; i < w; i++)
+ col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++)
+ col_sums_y[i] += im2[j1offs + i];
+ }
+ }
+}
+
+#define FS_COL_SET(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] = gx * (double)gx; \
+ col_sums_gy2[(_col)] = gy * (double)gy; \
+ col_sums_gxgy[(_col)] = gx * (double)gy; \
+ } \
+ while (0)
+
+#define FS_COL_ADD(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] += gx * (double)gx; \
+ col_sums_gy2[(_col)] += gy * (double)gy; \
+ col_sums_gxgy[(_col)] += gx * (double)gy; \
+ } \
+ while (0)
+
+#define FS_COL_SUB(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] -= gx * (double)gx; \
+ col_sums_gy2[(_col)] -= gy * (double)gy; \
+ col_sums_gxgy[(_col)] -= gx * (double)gy; \
+ } \
+ while (0)
+
+#define FS_COL_COPY(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
+ } \
+ while (0)
+
+#define FS_COL_HALVE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
+ } \
+ while (0)
+
+#define FS_COL_DOUBLE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
+ } \
+ while (0)
+
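/* Note on the macros above: they maintain eight running column sums of the
 * gradient products gx*gx, gy*gy and gx*gy taken from the ring-buffered rows
 * gx_buf/gy_buf. SET initializes a column, ADD/SUB slide it vertically, and
 * COPY/HALVE/DOUBLE derive one column's sums from a neighbor's, which lets
 * fs_calc_structure() below advance its weighted 8x8 window one pixel at a
 * time without recomputing it from scratch. */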
+static void fs_calc_structure(fs_ctx *_ctx, int _l) {
+ uint16_t *im1;
+ uint16_t *im2;
+ unsigned *gx_buf;
+ unsigned *gy_buf;
+ double *ssim;
+ double col_sums_gx2[8];
+ double col_sums_gy2[8];
+ double col_sums_gxgy[8];
+ double c2;
+ int stride;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ ssim = _ctx->level[_l].ssim;
+ gx_buf = _ctx->col_buf;
+ stride = w + 8;
+ gy_buf = gx_buf + 8 * stride;
+ memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
+ c2 = SSIM_C2 * (1 << 4 * _l) * 16 * 104;
+ for (j = 0; j < h + 4; j++) {
+ if (j < h - 1) {
+ for (i = 0; i < w - 1; i++) {
+ unsigned g1;
+ unsigned g2;
+ unsigned gx;
+ unsigned gy;
+ g1 = abs(im1[(j + 1) * w + i + 1] - im1[j * w + i]);
+ g2 = abs(im1[(j + 1) * w + i] - im1[j * w + i + 1]);
+ gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ g1 = abs(im2[(j + 1) * w + i + 1] - im2[j * w + i]);
+ g2 = abs(im2[(j + 1) * w + i] - im2[j * w + i + 1]);
+ gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ gx_buf[(j & 7) * stride + i + 4] = gx;
+ gy_buf[(j & 7) * stride + i + 4] = gy;
+ }
+ } else {
+ memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
+ memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
+ }
+ if (j >= 4) {
+ int k;
+ col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
+ col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
+ col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
+ col_sums_gxgy[0] = 0;
+ for (i = 4; i < 8; i++) {
+ FS_COL_SET(i, -1, 0);
+ FS_COL_ADD(i, 0, 0);
+ for (k = 1; k < 8 - i; k++) {
+ FS_COL_DOUBLE(i, i);
+ FS_COL_ADD(i, -k - 1, 0);
+ FS_COL_ADD(i, k, 0);
+ }
+ }
+ for (i = 0; i < w; i++) {
+ double mugx2;
+ double mugy2;
+ double mugxgy;
+ mugx2 = col_sums_gx2[0];
+ for (k = 1; k < 8; k++)
+ mugx2 += col_sums_gx2[k];
+ mugy2 = col_sums_gy2[0];
+ for (k = 1; k < 8; k++)
+ mugy2 += col_sums_gy2[k];
+ mugxgy = col_sums_gxgy[0];
+ for (k = 1; k < 8; k++)
+ mugxgy += col_sums_gxgy[k];
+ ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
+ if (i + 1 < w) {
+ FS_COL_SET(0, -1, 1);
+ FS_COL_ADD(0, 0, 1);
+ FS_COL_SUB(2, -3, 2);
+ FS_COL_SUB(2, 2, 2);
+ FS_COL_HALVE(1, 2);
+ FS_COL_SUB(3, -4, 3);
+ FS_COL_SUB(3, 3, 3);
+ FS_COL_HALVE(2, 3);
+ FS_COL_COPY(3, 4);
+ FS_COL_DOUBLE(4, 5);
+ FS_COL_ADD(4, -4, 5);
+ FS_COL_ADD(4, 3, 5);
+ FS_COL_DOUBLE(5, 6);
+ FS_COL_ADD(5, -3, 6);
+ FS_COL_ADD(5, 2, 6);
+ FS_COL_DOUBLE(6, 7);
+ FS_COL_ADD(6, -2, 7);
+ FS_COL_ADD(6, 1, 7);
+ FS_COL_SET(7, -1, 8);
+ FS_COL_ADD(7, 0, 8);
+ }
+ }
+ }
+ }
+}
+
+#define FS_NLEVELS (4)
+
+/*These weights were derived from the default weights found in Wang's original
+ Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+ We drop the finest scale and renormalize the rest to sum to 1.*/
+
+static const double FS_WEIGHTS[FS_NLEVELS] = {0.2989654541015625,
+ 0.3141326904296875, 0.2473602294921875, 0.1395416259765625};
+
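/* Numerically (assuming Wang's five-scale MS-SSIM weights {0.0448, 0.2856,
 * 0.3001, 0.2363, 0.1333}): dropping the finest scale (0.0448) leaves a sum
 * of 0.9553, and e.g. 0.2856 / 0.9553 ~= 0.29897 and 0.3001 / 0.9553 ~=
 * 0.31413, matching the table above. */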
+static double fs_average(fs_ctx *_ctx, int _l) {
+ double *ssim;
+ double ret;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ ssim = _ctx->level[_l].ssim;
+ ret = 0;
+ for (j = 0; j < h; j++)
+ for (i = 0; i < w; i++)
+ ret += ssim[j * w + i];
+ return pow(ret / (w * h), FS_WEIGHTS[_l]);
+}
+
+static double calc_ssim(const unsigned char *_src, int _systride,
+ const unsigned char *_dst, int _dystride, int _w, int _h) {
+ fs_ctx ctx;
+ double ret;
+ int l;
+ ret = 1;
+ fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
+ fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h);
+ for (l = 0; l < FS_NLEVELS - 1; l++) {
+ fs_calc_structure(&ctx, l);
+ ret *= fs_average(&ctx, l);
+ fs_downsample_level(&ctx, l + 1);
+ }
+ fs_calc_structure(&ctx, l);
+ fs_apply_luminance(&ctx, l);
+ ret *= fs_average(&ctx, l);
+ fs_ctx_clear(&ctx);
+ return ret;
+}
+
+static double convert_ssim_db(double _ssim, double _weight) {
+ return 10 * (log10(_weight) - log10(_weight - _ssim));
+}
+
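/* For intuition: with _weight = 1.0 this is 10 * log10(1 / (1 - _ssim)); an
 * SSIM of 0.99 maps to 20 dB and 0.999 to 30 dB, so each extra 10 dB is a
 * tenfold reduction in the SSIM error term (illustrative values only, not
 * from the source). */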
+double vp9_calc_fastssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+ double *ssim_y, double *ssim_u, double *ssim_v) {
+ double ssimv;
+ vp9_clear_system_state();
+
+ *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
+ dest->y_stride, source->y_crop_width,
+ source->y_crop_height);
+
+ *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height);
+
+ *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height);
+ ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
+
+ return convert_ssim_db(ssimv, 1.0);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
index f1baf8323af..9752668b15d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
@@ -38,6 +38,8 @@
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
+#define GROUP_ADAPTIVE_MAXQ 1
+
#define BOOST_BREAKOUT 12.5
#define BOOST_FACTOR 12.5
#define ERR_DIVISOR 128.0
@@ -49,10 +51,18 @@
#define KF_MAX_BOOST 128.0
#define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01
-#define MIN_GF_INTERVAL 4
#define MIN_KF_BOOST 300
#define NEW_MV_MODE_PENALTY 32
#define SVC_FACTOR_PT_LOW 0.45
+#define DARK_THRESH 64
+#define DEFAULT_GRP_WEIGHT 1.0
+#define RC_FACTOR_MIN 0.75
+#define RC_FACTOR_MAX 1.75
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+#define NCOUNT_FRAME_II_THRESH 5.0
#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
@@ -60,12 +70,6 @@
unsigned int arf_count = 0;
#endif
-static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
- YV12_BUFFER_CONFIG temp = *a;
- *a = *b;
- *b = temp;
-}
-
// Resets the first pass file to the given position using a relative seek from
// the current position.
static void reset_fpf_position(TWO_PASS *p,
@@ -106,10 +110,11 @@ static void output_stats(FIRSTPASS_STATS *stats,
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
- fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
- "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
- "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
+ fprintf(fpfile, "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+ "%12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n",
stats->frame,
+ stats->weight,
stats->intra_error,
stats->coded_error,
stats->sr_coded_error,
@@ -138,13 +143,14 @@ static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm,
struct vpx_codec_cx_pkt pkt;
pkt.kind = VPX_CODEC_FPMB_STATS_PKT;
pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
- pkt.data.firstpass_mb_stats.sz = cm->MBs * sizeof(uint8_t);
+ pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t);
vpx_codec_pkt_list_add(pktlist, &pkt);
}
#endif
static void zero_stats(FIRSTPASS_STATS *section) {
- section->frame = 0.0;
+ section->frame = 0.0;
+ section->weight = 0.0;
section->intra_error = 0.0;
section->coded_error = 0.0;
section->sr_coded_error = 0.0;
@@ -168,6 +174,7 @@ static void zero_stats(FIRSTPASS_STATS *section) {
static void accumulate_stats(FIRSTPASS_STATS *section,
const FIRSTPASS_STATS *frame) {
section->frame += frame->frame;
+ section->weight += frame->weight;
section->spatial_layer_id = frame->spatial_layer_id;
section->intra_error += frame->intra_error;
section->coded_error += frame->coded_error;
@@ -191,6 +198,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section,
static void subtract_stats(FIRSTPASS_STATS *section,
const FIRSTPASS_STATS *frame) {
section->frame -= frame->frame;
+ section->weight -= frame->weight;
section->intra_error -= frame->intra_error;
section->coded_error -= frame->coded_error;
section->sr_coded_error -= frame->sr_coded_error;
@@ -217,10 +225,11 @@ static double calculate_modified_err(const TWO_PASS *twopass,
const VP9EncoderConfig *oxcf,
const FIRSTPASS_STATS *this_frame) {
const FIRSTPASS_STATS *const stats = &twopass->total_stats;
- const double av_err = stats->coded_error / stats->count;
- const double modified_error = av_err *
- pow(this_frame->coded_error / DOUBLE_DIVIDE_CHECK(av_err),
- oxcf->two_pass_vbrbias / 100.0);
+ const double av_weight = stats->weight / stats->count;
+ const double av_err = (stats->coded_error * av_weight) / stats->count;
+ const double modified_error =
+ av_err * pow(this_frame->coded_error * this_frame->weight /
+ DOUBLE_DIVIDE_CHECK(av_err), oxcf->two_pass_vbrbias / 100.0);
return fclamp(modified_error,
twopass->modified_error_min, twopass->modified_error_max);
}
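/* In equation form (b = oxcf->two_pass_vbrbias, result clamped to
 * twopass->modified_error_min/max), the code above computes:
 *   av_err       = (sum_coded_error * av_weight) / count
 *   modified_err = av_err * ((coded_error_f * weight_f) / av_err)^(b / 100)
 * where av_weight is the average first-pass weight and coded_error_f,
 * weight_f are this frame's coded error and weight. */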
@@ -332,9 +341,9 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
// Refine the motion search range according to the frame dimension
// for first pass test.
-static int get_search_range(const VP9_COMMON *cm) {
+static int get_search_range(const VP9_COMP *cpi) {
int sr = 0;
- const int dim = MIN(cm->width, cm->height);
+ const int dim = MIN(cpi->initial_width, cpi->initial_height);
while ((dim << sr) < MAX_FULL_PEL_VAL)
++sr;
@@ -348,13 +357,13 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
MV tmp_mv = {0, 0};
MV ref_mv_full = {ref_mv->row >> 3, ref_mv->col >> 3};
int num00, tmp_err, n;
- const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
int step_param = 3;
int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
- const int sr = get_search_range(&cpi->common);
+ const int sr = get_search_range(cpi);
step_param += sr;
further_steps -= sr;
@@ -444,22 +453,16 @@ static void set_first_pass_params(VP9_COMP *cpi) {
void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
int mb_row, mb_col;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
TileInfo tile;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
- const PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none;
+ const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
int i;
int recon_yoffset, recon_uvoffset;
- YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
- YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
- YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
- int recon_y_stride = lst_yv12->y_stride;
- int recon_uv_stride = lst_yv12->uv_stride;
- int uv_mb_height = 16 >> (lst_yv12->y_height > lst_yv12->uv_height);
int64_t intra_error = 0;
int64_t coded_error = 0;
int64_t sr_coded_error = 0;
@@ -471,24 +474,41 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
int intercount = 0;
int second_ref_count = 0;
const int intrapenalty = INTRA_MODE_PENALTY;
- int neutral_count = 0;
+ double neutral_count;
int new_mv_count = 0;
int sum_in_vectors = 0;
MV lastmv = {0, 0};
TWO_PASS *twopass = &cpi->twopass;
const MV zero_mv = {0, 0};
+ int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+ YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
&cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
+ double intra_factor;
+ double brightness_factor;
+ BufferPool *const pool = cm->buffer_pool;
+
+ // First pass code requires valid last and new frame buffers.
+ assert(new_yv12 != NULL);
+ assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->MBs);
+ vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
}
#endif
vp9_clear_system_state();
+ intra_factor = 0.0;
+ brightness_factor = 0.0;
+ neutral_count = 0.0;
+
set_first_pass_params(cpi);
vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
@@ -521,20 +541,14 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
}
if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
- const int ref_idx =
- cm->ref_frame_map[get_ref_frame_idx(cpi, GOLDEN_FRAME)];
- const int scaled_idx = cpi->scaled_ref_idx[GOLDEN_FRAME - 1];
-
- gld_yv12 = (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf :
- get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+ if (gld_yv12 == NULL) {
+ gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ }
} else {
gld_yv12 = NULL;
}
- recon_y_stride = new_yv12->y_stride;
- recon_uv_stride = new_yv12->uv_stride;
- uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
-
set_ref_ptrs(cm, xd,
(cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME: NONE,
(cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
@@ -546,11 +560,14 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
vp9_setup_src_planes(x, cpi->Source, 0, 0);
- vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
- xd->mi = cm->mi;
- xd->mi[0].src_mi = &xd->mi[0];
+ if (!frame_is_intra_only(cm)) {
+ vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+ }
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
vp9_frame_init_quantizer(cpi);
@@ -568,6 +585,10 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
// Tiling is ignored in the first pass.
vp9_tile_init(&tile, cm, 0, 0);
+ recon_y_stride = new_yv12->y_stride;
+ recon_uv_stride = new_yv12->uv_stride;
+ uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+
for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
MV best_ref_mv = {0, 0};
@@ -585,8 +606,10 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
int this_error;
const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
- double error_weight = 1.0;
const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+ double log_intra;
+ int level_sample;
+
#if CONFIG_FP_MB_STATS
const int mb_index = mb_row * cm->mb_cols + mb_col;
#endif
@@ -597,22 +620,17 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
xd->left_available = (mb_col != 0);
- xd->mi[0].src_mi->mbmi.sb_type = bsize;
- xd->mi[0].src_mi->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->mbmi.sb_type = bsize;
+ xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
set_mi_row_col(xd, &tile,
mb_row << 1, num_8x8_blocks_high_lookup[bsize],
mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
cm->mi_rows, cm->mi_cols);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- const int energy = vp9_block_energy(cpi, x, bsize);
- error_weight = vp9_vaq_inv_q_ratio(energy);
- }
-
// Do intra 16x16 prediction.
x->skip_encode = 0;
- xd->mi[0].src_mi->mbmi.mode = DC_PRED;
- xd->mi[0].src_mi->mbmi.tx_size = use_dc_pred ?
+ xd->mi[0]->mbmi.mode = DC_PRED;
+ xd->mi[0]->mbmi.tx_size = use_dc_pred ?
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
vp9_encode_intra_block_plane(x, bsize, 0);
this_error = vp9_get_mb_ss(x->plane[0].src_diff);
@@ -635,10 +653,25 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
}
#endif // CONFIG_VP9_HIGHBITDEPTH
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_clear_system_state();
- this_error = (int)(this_error * error_weight);
- }
+ vp9_clear_system_state();
+ log_intra = log(this_error + 1.0);
+ if (log_intra < 10.0)
+ intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+ else
+ intra_factor += 1.0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+ else
+ level_sample = x->plane[0].src.buf[0];
+#else
+ level_sample = x->plane[0].src.buf[0];
+#endif
+ if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
+ brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+ else
+ brightness_factor += 1.0;
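/* Bounds check (derived from the code above): each MB contributes an intra
 * factor in [1.0, 1.5] (a log error of 10.0 or more contributes exactly 1.0)
 * and a brightness factor in [1.0, 1.64] (a fully dark sample of 0
 * contributes 1 + 0.01 * 64). Both are averaged over the MB count and
 * multiplied together below to form fps.weight, i.e. roughly [1.0, 2.46]
 * per frame. */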
// Intrapenalty below deals with situations where the intra and inter
// error scores are very low (e.g. a plain black frame).
@@ -711,20 +744,12 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search.
first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_clear_system_state();
- motion_error = (int)(motion_error * error_weight);
- }
// If the current best reference mv is not centered on 0,0 then do a
// 0,0 based search as well.
if (!is_zero_mv(&best_ref_mv)) {
tmp_err = INT_MAX;
first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_clear_system_state();
- tmp_err = (int)(tmp_err * error_weight);
- }
if (tmp_err < motion_error) {
motion_error = tmp_err;
@@ -755,10 +780,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
&gf_motion_error);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_clear_system_state();
- gf_motion_error = (int)(gf_motion_error * error_weight);
- }
if (gf_motion_error < motion_error && gf_motion_error < this_error)
++second_ref_count;
@@ -802,21 +823,30 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
#endif
if (motion_error <= this_error) {
+ vp9_clear_system_state();
+
// Keep a count of cases where the inter and intra were very close
// and very low. This helps with scene cut detection for example in
// cropped clips with black bars at the sides or top and bottom.
if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
- this_error < 2 * intrapenalty)
- ++neutral_count;
+ (this_error < (2 * intrapenalty))) {
+ neutral_count += 1.0;
+ // Also track cases where the intra is not much worse than the inter
+ // and use this in limiting the GF/arf group length.
+ } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+ (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+ neutral_count += (double)motion_error /
+ DOUBLE_DIVIDE_CHECK((double)this_error);
+ }
mv.row *= 8;
mv.col *= 8;
this_error = motion_error;
- xd->mi[0].src_mi->mbmi.mode = NEWMV;
- xd->mi[0].src_mi->mbmi.mv[0].as_mv = mv;
- xd->mi[0].src_mi->mbmi.tx_size = TX_4X4;
- xd->mi[0].src_mi->mbmi.ref_frame[0] = LAST_FRAME;
- xd->mi[0].src_mi->mbmi.ref_frame[1] = NONE;
+ xd->mi[0]->mbmi.mode = NEWMV;
+ xd->mi[0]->mbmi.mv[0].as_mv = mv;
+ xd->mi[0]->mbmi.tx_size = TX_4X4;
+ xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
+ xd->mi[0]->mbmi.ref_frame[1] = NONE;
vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
vp9_encode_sby_pass1(x, bsize);
sum_mvr += mv.row;
@@ -931,15 +961,20 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
vp9_clear_system_state();
}
- vp9_clear_system_state();
{
FIRSTPASS_STATS fps;
- // The minimum error here insures some bit alocation to frames even
+      // The minimum error here ensures some bit allocation to frames even
// in static regions. The allocation per MB declines for larger formats
// where the typical "real" energy per MB also falls.
// Initial estimate here uses sqrt(mbs) to define the min_err, where the
- // number of mbs is propotional to image area.
- const double min_err = 200 * sqrt(cm->MBs);
+ // number of mbs is proportional to the image area.
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs : cpi->common.MBs;
+ const double min_err = 200 * sqrt(num_mbs);
+
+ intra_factor = intra_factor / (double)num_mbs;
+ brightness_factor = brightness_factor / (double)num_mbs;
+ fps.weight = intra_factor * brightness_factor;
fps.frame = cm->current_video_frame;
fps.spatial_layer_id = cpi->svc.spatial_layer_id;
@@ -947,9 +982,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
fps.intra_error = (double)(intra_error >> 8) + min_err;
fps.count = 1.0;
- fps.pcnt_inter = (double)intercount / cm->MBs;
- fps.pcnt_second_ref = (double)second_ref_count / cm->MBs;
- fps.pcnt_neutral = (double)neutral_count / cm->MBs;
+ fps.pcnt_inter = (double)intercount / num_mbs;
+ fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
+ fps.pcnt_neutral = (double)neutral_count / num_mbs;
if (mvcount > 0) {
fps.MVr = (double)sum_mvr / mvcount;
@@ -960,7 +995,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount;
fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
fps.new_mv_count = new_mv_count;
- fps.pcnt_motion = (double)mvcount / cm->MBs;
+ fps.pcnt_motion = (double)mvcount / num_mbs;
} else {
fps.MVr = 0.0;
fps.mvr_abs = 0.0;
@@ -998,7 +1033,8 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
((twopass->this_frame_stats.intra_error /
DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
if (gld_yv12 != NULL) {
- vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idx]);
}
twopass->sr_update_lag = 1;
} else {
@@ -1010,14 +1046,17 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
if (lc != NULL) {
vp9_update_reference_frames(cpi);
} else {
- // Swap frame pointers so last frame refers to the frame we just compressed.
- swap_yv12(lst_yv12, new_yv12);
+ // The frame we just compressed now becomes the last frame.
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+ cm->new_fb_idx);
}
// Special case for the first frame. Copy into the GF buffer as a second
// reference.
- if (cm->current_video_frame == 0 && gld_yv12 != NULL && lc == NULL) {
- vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+ if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX &&
+ lc == NULL) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idx]);
}
// Use this to see what the first pass reconstruction looks like.
@@ -1066,16 +1105,17 @@ static double calc_correction_factor(double err_per_mb,
#define EDIV_SIZE_FACTOR 800
static int get_twopass_worst_quality(const VP9_COMP *cpi,
- const FIRSTPASS_STATS *stats,
- int section_target_bandwidth) {
+ const double section_err,
+ int section_target_bandwidth,
+ double group_weight_factor) {
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
if (section_target_bandwidth <= 0) {
return rc->worst_quality; // Highest value allowed
} else {
- const int num_mbs = cpi->common.MBs;
- const double section_err = stats->coded_error / stats->count;
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs : cpi->common.MBs;
const double err_per_mb = section_err / num_mbs;
const double speed_term = 1.0 + 0.04 * oxcf->speed;
const double ediv_size_correction = num_mbs / EDIV_SIZE_FACTOR;
@@ -1084,20 +1124,24 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
int q;
int is_svc_upper_layer = 0;
+
if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
is_svc_upper_layer = 1;
+
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
for (q = rc->best_quality; q < rc->worst_quality; ++q) {
const double factor =
- calc_correction_factor(err_per_mb, ERR_DIVISOR - ediv_size_correction,
+ calc_correction_factor(err_per_mb,
+ ERR_DIVISOR - ediv_size_correction,
is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
cpi->common.bit_depth);
- const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q,
- factor * speed_term,
- cpi->common.bit_depth);
+ const int bits_per_mb =
+ vp9_rc_bits_per_mb(INTER_FRAME, q,
+ factor * speed_term * group_weight_factor,
+ cpi->common.bit_depth);
if (bits_per_mb <= target_norm_bits_per_mb)
break;
}
@@ -1109,7 +1153,38 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
}
}
-extern void vp9_new_framerate(VP9_COMP *cpi, double framerate);
+static void setup_rf_level_maxq(VP9_COMP *cpi) {
+ int i;
+ RATE_CONTROL *const rc = &cpi->rc;
+ for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) {
+ int qdelta = vp9_frame_type_qdelta(cpi, i, rc->worst_quality);
+ rc->rf_level_maxq[i] = MAX(rc->worst_quality + qdelta, rc->best_quality);
+ }
+}
+
+void vp9_init_subsampling(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int w = cm->width;
+ const int h = cm->height;
+ int i;
+
+ for (i = 0; i < FRAME_SCALE_STEPS; ++i) {
+ // Note: Frames with odd-sized dimensions may result from this scaling.
+ rc->frame_width[i] = (w * 16) / frame_scale_factor[i];
+ rc->frame_height[i] = (h * 16) / frame_scale_factor[i];
+ }
+
+ setup_rf_level_maxq(cpi);
+}
+
+void calculate_coded_size(VP9_COMP *cpi,
+ int *scaled_frame_width,
+ int *scaled_frame_height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ *scaled_frame_width = rc->frame_width[rc->frame_size_selector];
+ *scaled_frame_height = rc->frame_height[rc->frame_size_selector];
+}
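vp9_init_subsampling() precomputes the candidate coded sizes in 1/16th units so that later frame-size selection is a table lookup. A standalone sketch of that fixed-point arithmetic follows; the scale factors are assumed example values, since frame_scale_factor[] is not visible in this hunk:

    #include <stdio.h>

    /* Assumed factors in 1/16th units: 16 = 1x, 24 = 2/3x, 32 = 1/2x. */
    static const int kScaleFactor[3] = { 16, 24, 32 };

    int main(void) {
      const int w = 1280, h = 720;
      int i;
      for (i = 0; i < 3; ++i) {
        /* Integer math in 1/16th units; odd dimensions are possible, as
         * the in-tree comment warns (1280 * 16 / 24 = 853). */
        printf("step %d: %dx%d\n", i,
               (w * 16) / kScaleFactor[i], (h * 16) / kScaleFactor[i]);
      }
      return 0;
    }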
void vp9_init_second_pass(VP9_COMP *cpi) {
SVC *const svc = &cpi->svc;
@@ -1179,6 +1254,10 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// Static sequence monitor variables.
twopass->kf_zeromotion_pct = 100;
twopass->last_kfgroup_zeromotion_pct = 100;
+
+ if (oxcf->resize_mode != RESIZE_NONE) {
+ vp9_init_subsampling(cpi);
+ }
}
#define SR_DIFF_PART 0.0015
@@ -1188,38 +1267,50 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
#define LOW_SR_DIFF_TRHESH 0.1
#define SR_DIFF_MAX 128.0
-static double get_sr_decay_rate(const VP9_COMMON *cm,
+static double get_sr_decay_rate(const VP9_COMP *cpi,
const FIRSTPASS_STATS *frame) {
- double sr_diff = (frame->sr_coded_error - frame->coded_error) / cm->MBs;
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs : cpi->common.MBs;
+ double sr_diff =
+ (frame->sr_coded_error - frame->coded_error) / num_mbs;
double sr_decay = 1.0;
+ double modified_pct_inter;
+ double modified_pcnt_intra;
const double motion_amplitude_factor =
frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
- const double pcnt_intra = 100 * (1.0 - frame->pcnt_inter);
+
+ modified_pct_inter = frame->pcnt_inter;
+ if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+ (double)NCOUNT_FRAME_II_THRESH) {
+ modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+ }
+ modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
sr_diff = MIN(sr_diff, SR_DIFF_MAX);
sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
(MOTION_AMP_PART * motion_amplitude_factor) -
- (INTRA_PART * pcnt_intra);
+ (INTRA_PART * modified_pcnt_intra);
}
- return MAX(sr_decay, MIN(DEFAULT_DECAY_LIMIT, frame->pcnt_inter));
+ return MAX(sr_decay, MIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
}
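The reworked decay above discounts neutral blocks from pcnt_inter before converting to an intra percentage, but only for frames that already look strongly inter-coded. A hedged sketch of just that adjustment; NCOUNT_FRAME_II_THRESH is not visible in this hunk, so the threshold below is a placeholder:

    /* Discount neutral blocks only when the intra/coded error ratio is
     * low; 5.0 stands in for NCOUNT_FRAME_II_THRESH. The small epsilon
     * mimics DOUBLE_DIVIDE_CHECK. */
    static double modified_pct_inter(double pcnt_inter, double pcnt_neutral,
                                     double intra_error, double coded_error) {
      const double ii_ratio = intra_error / (coded_error + 0.000001);
      if (ii_ratio < 5.0)
        return pcnt_inter - pcnt_neutral;
      return pcnt_inter;
    }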
// This function gives an estimate of how badly we believe the prediction
// quality is decaying from frame to frame.
-static double get_zero_motion_factor(const VP9_COMMON *cm,
+static double get_zero_motion_factor(const VP9_COMP *cpi,
const FIRSTPASS_STATS *frame) {
const double zero_motion_pct = frame->pcnt_inter -
frame->pcnt_motion;
- double sr_decay = get_sr_decay_rate(cm, frame);
+ double sr_decay = get_sr_decay_rate(cpi, frame);
return MIN(sr_decay, zero_motion_pct);
}
#define ZM_POWER_FACTOR 0.75
-static double get_prediction_decay_rate(const VP9_COMMON *cm,
+static double get_prediction_decay_rate(const VP9_COMP *cpi,
const FIRSTPASS_STATS *next_frame) {
- const double sr_decay_rate = get_sr_decay_rate(cm, next_frame);
+ const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
const double zero_motion_factor =
(0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
ZM_POWER_FACTOR));
@@ -1231,14 +1322,17 @@ static double get_prediction_decay_rate(const VP9_COMMON *cm,
// Function to test for a condition where a complex transition is followed
// by a static section. For example in slide shows where there is a fade
// between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(const TWO_PASS *twopass,
+static int detect_transition_to_still(VP9_COMP *cpi,
int frame_interval, int still_interval,
double loop_decay_rate,
double last_decay_rate) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+
// Break clause to detect very still sections after motion
// For example a static image after a fade or other transition
// instead of a clean scene cut.
- if (frame_interval > MIN_GF_INTERVAL &&
+ if (frame_interval > rc->min_gf_interval &&
loop_decay_rate >= 0.999 &&
last_decay_rate < 0.9) {
int j;
@@ -1313,12 +1407,14 @@ static double calc_frame_boost(VP9_COMP *cpi,
const double lq =
vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
cpi->common.bit_depth);
- const double boost_correction = MIN((0.5 + (lq * 0.015)), 1.5);
+ const double boost_q_correction = MIN((0.5 + (lq * 0.015)), 1.5);
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs : cpi->common.MBs;
// Underlying boost factor is based on inter error ratio.
- frame_boost = (BASELINE_ERR_PER_MB * cpi->common.MBs) /
+ frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
- frame_boost = frame_boost * BOOST_FACTOR * boost_correction;
+ frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
// Increase boost for frames where new data coming into frame (e.g. zoom out).
// Slightly reduce boost if there is a net balance of motion out of the frame
@@ -1329,7 +1425,7 @@ static double calc_frame_boost(VP9_COMP *cpi,
else
frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
- return MIN(frame_boost, max_boost * boost_correction);
+ return MIN(frame_boost, max_boost * boost_q_correction);
}
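calc_frame_boost() above scales an inter-error ratio by a quantizer-dependent correction and caps the result. A worked standalone sketch of the same shape; BASELINE_ERR_PER_MB, BOOST_FACTOR, and the cap are not visible here, so those constants are assumptions:

    static double min_d(double a, double b) { return a < b ? a : b; }

    /* boost = (baseline_err_per_mb * num_mbs) / coded_error, scaled by a
     * boost factor and a q correction, then capped. Constants 500.0,
     * 12.5, and 100.0 are illustrative only. */
    static double frame_boost_sketch(double coded_error, int num_mbs,
                                     double avg_q) {
      const double boost_q_correction = min_d(0.5 + avg_q * 0.015, 1.5);
      double boost = (500.0 * num_mbs) / (coded_error + 0.000001);
      boost *= 12.5 * boost_q_correction;
      return min_d(boost, 100.0 * boost_q_correction);
    }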
static int calc_arf_boost(VP9_COMP *cpi, int offset,
@@ -1365,7 +1461,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
// Accumulate the effect of prediction quality decay.
if (!flash_detected) {
- decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame);
+ decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
? MIN_DECAY_FACTOR : decay_accumulator;
}
@@ -1404,7 +1500,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
// Cumulative effect of prediction quality decay.
if (!flash_detected) {
- decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame);
+ decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
? MIN_DECAY_FACTOR : decay_accumulator;
}
@@ -1666,8 +1762,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
// Analyse and define a gf/arf group.
static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
- const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
TWO_PASS *const twopass = &cpi->twopass;
FIRSTPASS_STATS next_frame;
const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
@@ -1676,6 +1773,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double boost_score = 0.0;
double old_boost_score = 0.0;
double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+ double gf_group_raw_error = 0.0;
+#endif
double gf_first_frame_err = 0.0;
double mod_frame_err = 0.0;
@@ -1700,10 +1800,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int64_t gf_group_bits;
double gf_group_error_left;
int gf_arf_bits;
+ int is_key_frame = frame_is_intra_only(cm);
// Reset the GF group data structures unless this is a key
// frame in which case it will already have been done.
- if (cpi->common.frame_type != KEY_FRAME) {
+ if (is_key_frame == 0) {
vp9_zero(twopass->gf_group);
}
@@ -1719,11 +1820,16 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// If this is a key frame or the overlay from a previous arf then
// the error score / cost of this frame has already been accounted for.
- if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
+ if (is_key_frame || rc->source_alt_ref_active) {
gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error -= this_frame->coded_error;
+#endif
+ }
// Motion breakout threshold for loop below depends on image size.
- mv_ratio_accumulator_thresh = (cpi->common.width + cpi->common.height) / 4.0;
+ mv_ratio_accumulator_thresh =
+ (cpi->initial_height + cpi->initial_width) / 4.0;
// Set a maximum and minimum interval for the GF group.
// If the image appears almost completely static we can extend beyond this.
@@ -1734,7 +1840,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int int_lbq =
(int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex,
cpi->common.bit_depth));
- active_min_gf_interval = MIN_GF_INTERVAL + MIN(2, int_max_q / 200);
+ active_min_gf_interval = rc->min_gf_interval + MIN(2, int_max_q / 200);
if (active_min_gf_interval > rc->max_gf_interval)
active_min_gf_interval = rc->max_gf_interval;
@@ -1748,6 +1854,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
active_max_gf_interval = 12 + MIN(4, (int_lbq / 6));
if (active_max_gf_interval > rc->max_gf_interval)
active_max_gf_interval = rc->max_gf_interval;
+ if (active_max_gf_interval < active_min_gf_interval)
+ active_max_gf_interval = active_min_gf_interval;
}
}
@@ -1758,6 +1866,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Accumulate error score of frames in this gf group.
mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
if (EOF == input_stats(twopass, &next_frame))
break;
@@ -1775,18 +1886,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Accumulate the effect of prediction quality decay.
if (!flash_detected) {
last_loop_decay_rate = loop_decay_rate;
- loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
decay_accumulator = decay_accumulator * loop_decay_rate;
// Monitor for static sections.
zero_motion_accumulator =
- MIN(zero_motion_accumulator,
- get_zero_motion_factor(&cpi->common, &next_frame));
+ MIN(zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
// Break clause to detect very still sections after motion. For example,
// a static image after a fade or other transition.
- if (detect_transition_to_still(twopass, i, 5, loop_decay_rate,
+ if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
last_loop_decay_rate)) {
allow_alt_ref = 0;
break;
@@ -1820,8 +1930,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+ // Was the group length constrained by the requirement for a new KF?
+ rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
// Set the interval until the next gf.
- if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
+ if (is_key_frame || rc->source_alt_ref_active)
rc->baseline_gf_interval = i - 1;
else
rc->baseline_gf_interval = i;
@@ -1837,6 +1950,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (EOF == input_stats(twopass, this_frame))
break;
gf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
}
rc->baseline_gf_interval = new_gf_interval;
}
@@ -1846,7 +1962,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Should we use the alternate reference frame.
if (allow_alt_ref &&
(i < cpi->oxcf.lag_in_frames) &&
- (i >= MIN_GF_INTERVAL)) {
+ (i >= rc->min_gf_interval)) {
// Calculate the boost for alt ref.
rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
&b_boost);
@@ -1867,6 +1983,34 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Calculate the bits to be allocated to the gf/arf group as a whole
gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) {
+ const int vbr_group_bits_per_frame =
+ (int)(gf_group_bits / rc->baseline_gf_interval);
+ const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
+ int tmp_q;
+ // rc factor is a weight factor that corrects for local rate control drift.
+ double rc_factor = 1.0;
+ if (rc->rate_error_estimate > 0) {
+ rc_factor = MAX(RC_FACTOR_MIN,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ } else {
+ rc_factor = MIN(RC_FACTOR_MAX,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ }
+ tmp_q =
+ get_twopass_worst_quality(cpi, group_av_err, vbr_group_bits_per_frame,
+ twopass->kfgroup_inter_fraction * rc_factor);
+ twopass->active_worst_quality =
+ MAX(tmp_q, twopass->active_worst_quality >> 1);
+ }
+#endif
+
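The rc_factor that feeds get_twopass_worst_quality() above is a percentage-drift weight clamped asymmetrically. A minimal sketch of that clamp; RC_FACTOR_MIN and RC_FACTOR_MAX are not shown in this hunk, so 0.25 and 2.0 are assumed stand-ins:

    /* rate_error_estimate is a signed percentage: positive = undershoot.
     * Undershoot lowers the weight (floored), overshoot raises it (capped). */
    static double rc_factor_sketch(int rate_error_estimate) {
      const double f = (100.0 - rate_error_estimate) / 100.0;
      if (rate_error_estimate > 0)
        return f < 0.25 ? 0.25 : f;  /* MAX(RC_FACTOR_MIN, f) */
      return f > 2.0 ? 2.0 : f;      /* MIN(RC_FACTOR_MAX, f) */
    }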
// Calculate the extra bits to be used for boosted frame(s)
gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
rc->gfu_boost, gf_group_bits);
@@ -1882,7 +2026,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// also a key frame in which case it has already been accounted for.
if (rc->source_alt_ref_pending) {
gf_group_error_left = gf_group_err - mod_frame_err;
- } else if (cpi->common.frame_type != KEY_FRAME) {
+ } else if (is_key_frame == 0) {
gf_group_error_left = gf_group_err - gf_first_frame_err;
} else {
gf_group_error_left = gf_group_err;
@@ -1900,31 +2044,68 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
calculate_section_intra_ratio(start_pos, twopass->stats_in_end,
rc->baseline_gf_interval);
}
+
+ if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+ // Default to starting GF groups at normal frame size.
+ cpi->rc.next_frame_size_selector = UNSCALED;
+ }
}
-// TODO(PGW) Re-examine the use of II ration in this code in the light of#
-// changes elsewhere
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+#define SECOND_REF_USEAGE_THRESH 0.1
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letterbox
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case, even if the frame is not a scene cut, coding a key frame
+// may still be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra/inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
#define KF_II_MAX 128.0
+
static int test_candidate_kf(TWO_PASS *twopass,
const FIRSTPASS_STATS *last_frame,
const FIRSTPASS_STATS *this_frame,
const FIRSTPASS_STATS *next_frame) {
int is_viable_kf = 0;
+ double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+ double modified_pcnt_inter =
+ this_frame->pcnt_inter - this_frame->pcnt_neutral;
// Does the frame satisfy the primary criteria of a key frame?
+ // See above for an explanation of the test criteria.
// If so, then examine how well it predicts subsequent frames.
- if ((this_frame->pcnt_second_ref < 0.10) &&
- (next_frame->pcnt_second_ref < 0.10) &&
- ((this_frame->pcnt_inter < 0.05) ||
- (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < 0.35) &&
+ if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+ ((pcnt_intra > MIN_INTRA_LEVEL) &&
+ (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
((this_frame->intra_error /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+ KF_II_ERR_THRESHOLD) &&
((fabs(last_frame->coded_error - this_frame->coded_error) /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > 0.40) ||
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+ ERR_CHANGE_THRESHOLD) ||
(fabs(last_frame->intra_error - this_frame->intra_error) /
- DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > 0.40) ||
+ DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+ ERR_CHANGE_THRESHOLD) ||
((next_frame->intra_error /
- DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
+ DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+ II_IMPROVEMENT_THRESHOLD))))) {
int i;
const FIRSTPASS_STATS *start_pos = twopass->stats_in;
FIRSTPASS_STATS local_next_frame = *next_frame;
@@ -2048,8 +2229,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
break;
// How fast is the prediction quality decaying?
- loop_decay_rate = get_prediction_decay_rate(&cpi->common,
- twopass->stats_in);
+ loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
// We want to know something about the recent past... rather than
// as used elsewhere where we are concerned with decay in prediction
@@ -2061,7 +2241,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Special check for transition or high motion followed by a
// static scene.
- if (detect_transition_to_still(twopass, i, cpi->oxcf.key_freq - i,
+ if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
loop_decay_rate, decay_accumulator))
break;
@@ -2091,7 +2271,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Reset to the start of the group.
reset_fpf_position(twopass, start_position);
- kf_group_err = 0;
+ kf_group_err = 0.0;
// Rescan to get the correct error data for the forced kf group.
for (i = 0; i < rc->frames_to_key; ++i) {
@@ -2160,7 +2340,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Monitor for static sections.
zero_motion_accumulator =
MIN(zero_motion_accumulator,
- get_zero_motion_factor(&cpi->common, &next_frame));
+ get_zero_motion_factor(cpi, &next_frame));
// Not all frames in the group are necessarily used in calculating boost.
if ((i <= rc->max_gf_interval) ||
@@ -2171,7 +2351,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// How fast is prediction quality decaying.
if (!detect_flash(twopass, 0)) {
const double loop_decay_rate =
- get_prediction_decay_rate(&cpi->common, &next_frame);
+ get_prediction_decay_rate(cpi, &next_frame);
decay_accumulator *= loop_decay_rate;
decay_accumulator = MAX(decay_accumulator, MIN_DECAY_FACTOR);
av_decay_accumulator += decay_accumulator;
@@ -2201,6 +2381,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
rc->kf_boost, twopass->kf_group_bits);
+ // Work out the fraction of the kf group bits reserved for the inter frames
+ // within the group after discounting the bits for the kf itself.
+ if (twopass->kf_group_bits) {
+ twopass->kfgroup_inter_fraction =
+ (double)(twopass->kf_group_bits - kf_bits) /
+ (double)twopass->kf_group_bits;
+ } else {
+ twopass->kfgroup_inter_fraction = 1.0;
+ }
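This fraction later weights the group max-q estimate in define_gf_group() (via kfgroup_inter_fraction * rc_factor). The computation is easy to sanity-check in isolation:

    /* Mirrors the computation above: the share of a kf group's bits left
     * for its inter frames once the key frame's own bits are removed. */
    static double inter_fraction(long long kf_group_bits, long long kf_bits) {
      /* e.g. 1,000,000 group bits with a 200,000-bit key frame -> 0.8 */
      if (!kf_group_bits)
        return 1.0;
      return (double)(kf_group_bits - kf_bits) / (double)kf_group_bits;
    }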
+
twopass->kf_group_bits -= kf_bits;
// Save the bits to spend on the key frame.
@@ -2215,35 +2405,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// The count of bits left is adjusted elsewhere based on real coded frame
// sizes.
twopass->modified_error_left -= kf_group_err;
-}
-#define VBR_PCT_ADJUSTMENT_LIMIT 50
-// For VBR...adjustment to the frame target based on error from previous frames
-void vbr_rate_correction(VP9_COMP *cpi,
- int * this_frame_target,
- const int64_t vbr_bits_off_target) {
- int max_delta;
- double position_factor = 1.0;
-
- // How far through the clip are we.
- // This number is used to damp the per frame rate correction.
- // Range 0 - 1.0
- if (cpi->twopass.total_stats.count) {
- position_factor = sqrt((double)cpi->common.current_video_frame /
- cpi->twopass.total_stats.count);
- }
- max_delta = (int)(position_factor *
- ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
-
- // vbr_bits_off_target > 0 means we have extra bits to spend
- if (vbr_bits_off_target > 0) {
- *this_frame_target +=
- (vbr_bits_off_target > max_delta) ? max_delta
- : (int)vbr_bits_off_target;
- } else {
- *this_frame_target -=
- (vbr_bits_off_target < -max_delta) ? max_delta
- : (int)-vbr_bits_off_target;
+ if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+ // Default to normal-sized frame on keyframes.
+ cpi->rc.next_frame_size_selector = UNSCALED;
}
}
@@ -2295,6 +2460,24 @@ void configure_buffer_updates(VP9_COMP *cpi) {
}
}
+int is_skippable_frame(const VP9_COMP *cpi) {
+ // If no non-zero motion vector was detected for the current frame in the
+ // first pass, nor for its previous and forward frames, then the partition
+ // check can be skipped for this frame and the partition size assigned
+ // according to the variance.
+ const SVC *const svc = &cpi->svc;
+ const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
+ &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
+
+ return (!frame_is_intra_only(&cpi->common) &&
+ twopass->stats_in - 2 > twopass->stats_in_start &&
+ twopass->stats_in < twopass->stats_in_end &&
+ (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
+ == 1 &&
+ (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
+ == 1 &&
+ twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
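The equality tests above rely on pcnt_inter and pcnt_motion being exact fractions of the frame's macroblocks: their difference is 1.0 only when every block was inter-coded with zero motion. A minimal sketch, with an assumed stats struct:

    struct fp_stats { double pcnt_inter, pcnt_motion; };

    /* 1.0 here means: all blocks inter (pcnt_inter == 1) and none of
     * them carried motion (pcnt_motion == 0). */
    static int frame_is_still(const struct fp_stats *s) {
      return s->pcnt_inter - s->pcnt_motion == 1;
    }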
void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
@@ -2303,7 +2486,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
GF_GROUP *const gf_group = &twopass->gf_group;
int frames_left;
FIRSTPASS_STATS this_frame;
- FIRSTPASS_STATS this_frame_copy;
int target_rate;
LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
@@ -2329,11 +2511,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
rc->base_frame_target = target_rate;
- // Correction to rate target based on prior over or under shoot.
- if (cpi->oxcf.rc_mode == VPX_VBR)
- vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
-
- vp9_rc_set_frame_target(cpi, target_rate);
cm->frame_type = INTER_FRAME;
if (lc != NULL) {
@@ -2347,6 +2524,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
}
}
+ // Do the firstpass stats indicate that this frame is skippable for the
+ // partition search?
+ if (cpi->sf.allow_partition_search_skip &&
+ cpi->oxcf.pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
+ cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+ }
+
return;
}
@@ -2359,9 +2543,14 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
// Special case code for first frame.
const int section_target_bandwidth = (int)(twopass->bits_left /
frames_left);
- const int tmp_q = get_twopass_worst_quality(cpi, &twopass->total_left_stats,
- section_target_bandwidth);
+ const double section_error =
+ twopass->total_left_stats.coded_error / twopass->total_left_stats.count;
+ const int tmp_q =
+ get_twopass_worst_quality(cpi, section_error,
+ section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
twopass->active_worst_quality = tmp_q;
+ twopass->baseline_active_worst_quality = tmp_q;
rc->ni_av_qi = tmp_q;
rc->last_q[INTER_FRAME] = tmp_q;
rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth);
@@ -2373,14 +2562,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
if (EOF == input_stats(twopass, &this_frame))
return;
- // Local copy of the current frame's first pass stats.
- this_frame_copy = this_frame;
-
// Keyframe and section processing.
- if (rc->frames_to_key == 0 ||
- (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ FIRSTPASS_STATS this_frame_copy;
+ this_frame_copy = this_frame;
// Define next KF group and assign bits to it.
- find_next_key_frame(cpi, &this_frame_copy);
+ find_next_key_frame(cpi, &this_frame);
+ this_frame = this_frame_copy;
} else {
cm->frame_type = INTER_FRAME;
}
@@ -2409,7 +2597,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
// Define a new GF/ARF group. (Should always enter here for key frames).
if (rc->frames_till_gf_update_due == 0) {
- define_gf_group(cpi, &this_frame_copy);
+ define_gf_group(cpi, &this_frame);
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
if (lc != NULL)
@@ -2431,6 +2619,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
configure_buffer_updates(cpi);
+ // Do the firstpass stats indicate that this frame is skippable for the
+ // partition search?
+ if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
+ (!cpi->use_svc || is_two_pass_svc(cpi))) {
+ cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+ }
+
target_rate = gf_group->bit_allocation[gf_group->index];
if (cpi->common.frame_type == KEY_FRAME)
target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
@@ -2439,18 +2634,21 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
rc->base_frame_target = target_rate;
- // Correction to rate target based on prior over or under shoot.
- if (cpi->oxcf.rc_mode == VPX_VBR)
- vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
-
- vp9_rc_set_frame_target(cpi, target_rate);
+ {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs : cpi->common.MBs;
+ // The multiplication by 256 reverses a scaling factor of (>> 8)
+ // applied when combining MB error values for the frame.
+ twopass->mb_av_energy =
+ log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+ }
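mb_av_energy above undoes the first pass's (>> 8) scaling before taking a log, giving a per-MB intra-energy measure that is stable across frame sizes. A standalone sketch of the same expression:

    #include <math.h>

    /* intra_error arrives pre-scaled by (>> 8); multiply by 256 to undo
     * that, average per MB, and compress with log(x + 1). */
    static double mb_av_energy_sketch(double intra_error, int num_mbs) {
      return log(((intra_error * 256.0) / num_mbs) + 1.0);
    }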
// Update the total stats remaining structure.
subtract_stats(&twopass->total_left_stats, &this_frame);
}
-#define MINQ_ADJ_LIMIT 32
-#define Q_LIMIT_STEP 1
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
void vp9_twopass_postencode_update(VP9_COMP *cpi) {
TWO_PASS *const twopass = &cpi->twopass;
RATE_CONTROL *const rc = &cpi->rc;
@@ -2483,31 +2681,39 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
// Increment the gf group index ready for the next frame.
++twopass->gf_group.index;
- // If the rate control is drifting consider adjustment ot min or maxq.
- // Only make adjustments on gf/arf
- if ((cpi->oxcf.rc_mode == VPX_VBR) &&
+ // If the rate control is drifting, consider adjusting min or max q.
+ if ((cpi->oxcf.rc_mode != VPX_Q) &&
(cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
!cpi->rc.is_src_frame_alt_ref) {
const int maxq_adj_limit =
rc->worst_quality - twopass->active_worst_quality;
+ const int minq_adj_limit =
+ (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
// Undershoot.
if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
--twopass->extend_maxq;
if (rc->rolling_target_bits >= rc->rolling_actual_bits)
- twopass->extend_minq += Q_LIMIT_STEP;
+ ++twopass->extend_minq;
// Overshoot.
} else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
--twopass->extend_minq;
if (rc->rolling_target_bits < rc->rolling_actual_bits)
- twopass->extend_maxq += Q_LIMIT_STEP;
+ ++twopass->extend_maxq;
} else {
+ // Adjustment for extreme local overshoot.
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+
+ // Unwind undershoot or overshoot adjustment.
if (rc->rolling_target_bits < rc->rolling_actual_bits)
--twopass->extend_minq;
- if (rc->rolling_target_bits > rc->rolling_actual_bits)
+ else if (rc->rolling_target_bits > rc->rolling_actual_bits)
--twopass->extend_maxq;
}
- twopass->extend_minq = clamp(twopass->extend_minq, 0, MINQ_ADJ_LIMIT);
+
+ twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
}
}
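The post-encode update above nudges extend_minq/extend_maxq by one step per frame and clamps them to mode-dependent limits. A condensed sketch of one undershoot step (the rolling-bits guard on the minq increment is omitted for brevity):

    static int clamp_int(int v, int lo, int hi) {
      return v < lo ? lo : (v > hi ? hi : v);
    }

    /* One undershoot step: allow a deeper min-q extension next time and
     * pull the max-q extension back by one, then re-clamp both. */
    static void on_undershoot(int *extend_minq, int *extend_maxq,
                              int minq_adj_limit, int maxq_adj_limit) {
      --*extend_maxq;
      ++*extend_minq;
      *extend_minq = clamp_int(*extend_minq, 0, minq_adj_limit);
      *extend_maxq = clamp_int(*extend_maxq, 0, maxq_adj_limit);
    }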
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
index e21d86928da..08e7a8bf114 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
@@ -43,6 +43,7 @@ typedef struct {
typedef struct {
double frame;
+ double weight;
double intra_error;
double coded_error;
double sr_coded_error;
@@ -95,6 +96,7 @@ typedef struct {
double modified_error_min;
double modified_error_max;
double modified_error_left;
+ double mb_av_energy;
#if CONFIG_FP_MB_STATS
uint8_t *frame_mb_stats_buf;
@@ -107,12 +109,17 @@ typedef struct {
// Error score of frames still to be coded in kf group
int64_t kf_group_error_left;
+
+ // The fraction of a kf group's total bits allocated to the inter frames.
+ double kfgroup_inter_fraction;
+
int sr_update_lag;
int kf_zeromotion_pct;
int last_kfgroup_zeromotion_pct;
int gf_zeromotion_pct;
int active_worst_quality;
+ int baseline_active_worst_quality;
int extend_minq;
int extend_maxq;
@@ -131,6 +138,13 @@ void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
// Post encode update of the rate control parameters for 2-pass
void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
+
+void vp9_init_subsampling(struct VP9_COMP *cpi);
+
+void calculate_coded_size(struct VP9_COMP *cpi,
+ int *scaled_frame_width,
+ int *scaled_frame_height);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c
index 823e7a16242..b8e2ca88c8c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c
@@ -65,6 +65,7 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
// Allocate the lookahead structures
ctx = calloc(1, sizeof(*ctx));
if (ctx) {
+ const int legacy_byte_alignment = 0;
unsigned int i;
ctx->max_sz = depth;
ctx->buf = calloc(depth, sizeof(*ctx->buf));
@@ -76,7 +77,8 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS))
+ VP9_ENC_BORDER_IN_PIXELS,
+ legacy_byte_alignment))
goto bail;
}
return ctx;
@@ -88,19 +90,40 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
#define USE_PARTIAL_COPY 0
int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
- int64_t ts_start, int64_t ts_end, unsigned int flags) {
+ int64_t ts_start, int64_t ts_end,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ unsigned int flags) {
struct lookahead_entry *buf;
#if USE_PARTIAL_COPY
int row, col, active_end;
int mb_rows = (src->y_height + 15) >> 4;
int mb_cols = (src->y_width + 15) >> 4;
#endif
+ int width = src->y_crop_width;
+ int height = src->y_crop_height;
+ int uv_width = src->uv_crop_width;
+ int uv_height = src->uv_crop_height;
+ int subsampling_x = src->subsampling_x;
+ int subsampling_y = src->subsampling_y;
+ int larger_dimensions, new_dimensions;
if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz)
return 1;
ctx->sz++;
buf = pop(ctx, &ctx->write_idx);
+ new_dimensions = width != buf->img.y_crop_width ||
+ height != buf->img.y_crop_height ||
+ uv_width != buf->img.uv_crop_width ||
+ uv_height != buf->img.uv_crop_height;
+ larger_dimensions = width > buf->img.y_width ||
+ height > buf->img.y_height ||
+ uv_width > buf->img.uv_width ||
+ uv_height > buf->img.uv_height;
+ assert(!larger_dimensions || new_dimensions);
+
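The two flags above distinguish "the frame changed size" from "the frame no longer fits in the allocation"; the assert encodes that the second implies the first. A toy standalone check of the push-time policy (plane bookkeeping reduced to a single width for illustration):

    #include <assert.h>
    #include <stdio.h>

    int main(void) {
      int alloc_w = 1280, crop_w = 1280;  /* current buffer */
      int src_w = 640;                    /* incoming, dynamically resized */
      int new_dimensions = src_w != crop_w;
      int larger_dimensions = src_w > alloc_w;
      assert(!larger_dimensions || new_dimensions);
      if (larger_dimensions)
        printf("realloc to %d\n", src_w);        /* fresh buffer, swap in */
      else if (new_dimensions)
        printf("recrop to %d (storage kept at %d)\n", src_w, alloc_w);
      return 0;
    }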
#if USE_PARTIAL_COPY
// TODO(jkoleszar): This is disabled for now, as
// vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
@@ -109,7 +132,7 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
// 1. Lookahead queue has has size of 1.
// 2. Active map is provided.
// 3. This is not a key frame, golden nor altref frame.
- if (ctx->max_sz == 1 && active_map && !flags) {
+ if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
for (row = 0; row < mb_rows; ++row) {
col = 0;
@@ -145,11 +168,32 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
active_map += mb_cols;
}
} else {
+#endif
+ if (larger_dimensions) {
+ YV12_BUFFER_CONFIG new_img;
+ memset(&new_img, 0, sizeof(new_img));
+ if (vp9_alloc_frame_buffer(&new_img,
+ width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS,
+ 0))
+ return 1;
+ vp9_free_frame_buffer(&buf->img);
+ buf->img = new_img;
+ } else if (new_dimensions) {
+ buf->img.y_crop_width = src->y_crop_width;
+ buf->img.y_crop_height = src->y_crop_height;
+ buf->img.uv_crop_width = src->uv_crop_width;
+ buf->img.uv_crop_height = src->uv_crop_height;
+ buf->img.subsampling_x = src->subsampling_x;
+ buf->img.subsampling_y = src->subsampling_y;
+ }
+ // Partial copy not implemented yet
vp9_copy_and_extend_frame(src, &buf->img);
+#if USE_PARTIAL_COPY
}
-#else
- // Partial copy not implemented yet
- vp9_copy_and_extend_frame(src, &buf->img);
#endif
buf->ts_start = ts_start;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h
index a33d3002e5b..13820380ff4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h
@@ -79,7 +79,11 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
* \param[in] active_map Map that specifies which macroblock is active
*/
int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
- int64_t ts_start, int64_t ts_end, unsigned int flags);
+ int64_t ts_start, int64_t ts_end,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ unsigned int flags);
/**\brief Get the next source buffer to encode
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c
index bd04c56a47d..d5eeb9cc546 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -10,6 +10,9 @@
#include <limits.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
#include "vpx_mem/vpx_mem.h"
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_mcomp.h"
@@ -24,7 +27,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
MV *dst_mv,
int mb_row,
int mb_col) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
@@ -63,8 +66,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
&distortion, &sse, NULL, 0, 0);
}
- xd->mi[0].src_mi->mbmi.mode = NEWMV;
- xd->mi[0].src_mi->mbmi.mv[0].as_mv = *dst_mv;
+ xd->mi[0]->mbmi.mode = NEWMV;
+ xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv;
vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
@@ -74,20 +77,20 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
x->mv_row_min = tmp_row_min;
x->mv_row_max = tmp_row_max;
- return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
- xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+ return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
}
static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
int_mv *dst_mv, int mb_row, int mb_col) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
unsigned int err, tmp_err;
MV tmp_mv;
// Try zero MV first
// FIXME should really use something like near/nearest MV and/or MV prediction
- err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
dst_mv->as_int = 0;
@@ -117,13 +120,13 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
}
static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
unsigned int err;
// Try zero MV first
// FIXME should really use something like near/nearest MV and/or MV prediction
- err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
dst_mv->as_int = 0;
@@ -131,7 +134,7 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
return err;
}
static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
PREDICTION_MODE best_mode = -1, mode;
unsigned int best_err = INT_MAX;
@@ -141,12 +144,12 @@ static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
unsigned int err;
- xd->mi[0].src_mi->mbmi.mode = mode;
+ xd->mi[0]->mbmi.mode = mode;
vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride,
0, 0, 0);
- err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride);
// find best
@@ -174,7 +177,7 @@ static void update_mbgraph_mb_stats
int mb_row,
int mb_col
) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
int intra_error;
VP9_COMMON *cm = &cpi->common;
@@ -229,7 +232,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
YV12_BUFFER_CONFIG *buf,
YV12_BUFFER_CONFIG *golden_ref,
YV12_BUFFER_CONFIG *alt_ref) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
VP9_COMMON *const cm = &cpi->common;
@@ -247,7 +250,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
xd->plane[0].dst.stride = buf->y_stride;
xd->plane[0].pre[0].stride = buf->y_stride;
xd->plane[1].dst.stride = buf->uv_stride;
- xd->mi[0].src_mi = &mi_local;
+ xd->mi[0] = &mi_local;
mi_local.mbmi.sb_type = BLOCK_16X16;
mi_local.mbmi.ref_frame[0] = LAST_FRAME;
mi_local.mbmi.ref_frame[1] = NONE;
@@ -376,6 +379,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ assert(golden_ref != NULL);
+
// we need to look ahead beyond where the ARF transitions into
// being a GF - so exit if we don't look ahead beyond that
if (n_frames <= cpi->rc.frames_till_gf_update_due)
@@ -387,9 +392,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
cpi->mbgraph_n_frames = n_frames;
for (i = 0; i < n_frames; i++) {
MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
- vpx_memset(frame_stats->mb_stats, 0,
- cm->mb_rows * cm->mb_cols *
- sizeof(*cpi->mbgraph_stats[i].mb_stats));
+ memset(frame_stats->mb_stats, 0,
+ cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
}
// do motion search to find contribution of each reference to data
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
index 69b4193840d..80c509a1b49 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
@@ -90,13 +90,10 @@ static int mv_err_cost(const MV *mv, const MV *ref,
static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
int error_per_bit) {
- if (x->nmvsadcost) {
- const MV diff = { mv->row - ref->row,
- mv->col - ref->col };
- return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
- x->nmvsadcost) * error_per_bit, 8);
- }
- return 0;
+ const MV diff = { mv->row - ref->row,
+ mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
+ x->nmvsadcost) * error_per_bit, 8);
}
void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
@@ -286,42 +283,53 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
bestmv->row *= 8; \
bestmv->col *= 8;
+static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
+ const MV *bestmv,
+ const MV *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ const uint8_t *const src,
+ const int src_stride,
+ const uint8_t *const y,
+ int y_stride,
+ const uint8_t *second_pred,
+ int w, int h, int offset,
+ int *mvjcost, int *mvcost[2],
+ unsigned int *sse1,
+ int *distortion) {
+ unsigned int besterr;
#if CONFIG_VP9_HIGHBITDEPTH
-#define SETUP_CENTER_ERROR \
- if (second_pred != NULL) { \
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64); \
- vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, \
- y_stride); \
- besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride, \
- sse1); \
- } else { \
- DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \
- vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
- besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \
- } \
- } else { \
- besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \
- } \
- *distortion = besterr; \
+ if (second_pred != NULL) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+ vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+ y_stride);
+ besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
+ sse1);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ }
+ } else {
+ besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
#else
-
-#define SETUP_CENTER_ERROR \
- if (second_pred != NULL) { \
- DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \
- vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
- besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \
- } else { \
- besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \
- } \
- *distortion = besterr; \
+ (void) xd;
+ if (second_pred != NULL) {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ } else {
+ besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
#endif // CONFIG_VP9_HIGHBITDEPTH
-
-
-
+ return besterr;
+}
static INLINE int divide_and_round(const int n, const int d) {
return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
@@ -365,7 +373,10 @@ int vp9_find_best_sub_pixel_tree_pruned_evenmore(
const uint8_t *second_pred,
int w, int h) {
SETUP_SUBPEL_SEARCH;
- SETUP_CENTER_ERROR;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ z, src_stride, y, y_stride, second_pred,
+ w, h, offset, mvjcost, mvcost,
+ sse1, distortion);
(void) halfiters;
(void) quarteriters;
(void) eighthiters;
@@ -441,7 +452,10 @@ int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
const uint8_t *second_pred,
int w, int h) {
SETUP_SUBPEL_SEARCH;
- SETUP_CENTER_ERROR;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ z, src_stride, y, y_stride, second_pred,
+ w, h, offset, mvjcost, mvcost,
+ sse1, distortion);
if (cost_list &&
cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
@@ -512,7 +526,10 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
const uint8_t *second_pred,
int w, int h) {
SETUP_SUBPEL_SEARCH;
- SETUP_CENTER_ERROR;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ z, src_stride, y, y_stride, second_pred,
+ w, h, offset, mvjcost, mvcost,
+ sse1, distortion);
if (cost_list &&
cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
@@ -590,6 +607,13 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
return besterr;
}
+const MV search_step_table[12] = {
+ // left, right, up, down
+ {0, -4}, {0, 4}, {-4, 0}, {4, 0},
+ {0, -2}, {0, 2}, {-2, 0}, {2, 0},
+ {0, -1}, {0, 1}, {-1, 0}, {1, 0}
+};
+
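search_step_table packs three refinement rounds, half, quarter, and eighth pel, as left/right/up/down offsets of +-4, +-2, +-1 in 1/8-pel units; the loop consumes four entries per round (search_step += 4) while halving hstep. A skeleton of that round structure (the vp9_use_mv_hp() test is folded into allow_hp here):

    static const int kStepTable[3] = { 4, 2, 1 };  /* 1/8-pel units */

    static void subpel_rounds(int forced_stop, int allow_hp) {
      /* forced_stop: 0 = full, 1 = quarter only, 2 = half only. */
      int round = 3 - forced_stop;
      int iter;
      if (!allow_hp && round == 3)
        round = 2;  /* no eighth-pel round without high precision */
      for (iter = 0; iter < round; ++iter) {
        const int hstep = kStepTable[iter];
        /* probe {0,-hstep}, {0,+hstep}, {-hstep,0}, {+hstep,0}, then the
         * diagonal picked from the cheaper side of each axis */
        (void)hstep;
      }
    }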
int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
int allow_hp,
@@ -603,43 +627,129 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
unsigned int *sse1,
const uint8_t *second_pred,
int w, int h) {
- SETUP_SUBPEL_SEARCH;
- SETUP_CENTER_ERROR;
+ const uint8_t *const z = x->plane[0].src.buf;
+ const uint8_t *const src_address = z;
+ const int src_stride = x->plane[0].src.stride;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ unsigned int whichdir = 0;
+ int thismse;
+ const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ int iter, round = 3 - forced_stop;
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5];
+
+ if (!(allow_hp && vp9_use_mv_hp(ref_mv)))
+ if (round == 3)
+ round = 2;
+
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ z, src_stride, y, y_stride, second_pred,
+ w, h, offset, mvjcost, mvcost,
+ sse1, distortion);
+
(void) cost_list; // to silence compiler warning
- // Each subsequent iteration checks at least one point in
- // common with the last iteration could be 2 ( if diag selected)
- // 1/2 pel
- FIRST_LEVEL_CHECKS;
- if (halfiters > 1) {
- SECOND_LEVEL_CHECKS;
- }
- tr = br;
- tc = bc;
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+ int row_offset = (tr & 0x07) << 1;
+ int col_offset = (tc & 0x07) << 1;
+ MV this_mv;
+ this_mv.row = tr;
+ this_mv.col = tc;
+ if (second_pred == NULL)
+ thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset,
+ src_address, src_stride, &sse);
+ else
+ thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset,
+ src_address, src_stride, &sse, second_pred);
+ cost_array[idx] = thismse +
+ mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;
+ }
+ }
- // Each subsequent iteration checks at least one point in common with
- // the last iteration could be 2 ( if diag selected) 1/4 pel
+ // Check diagonal sub-pixel position
+ tc = bc + (cost_array[0] < cost_array[1] ? -hstep : hstep);
+ tr = br + (cost_array[2] < cost_array[3] ? -hstep : hstep);
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+ int row_offset = (tr & 0x07) << 1;
+ int col_offset = (tc & 0x07) << 1;
+ MV this_mv = {tr, tc};
+ if (second_pred == NULL)
+ thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset,
+ src_address, src_stride, &sse);
+ else
+ thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset,
+ src_address, src_stride, &sse, second_pred);
+ cost_array[4] = thismse +
+ mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+      cost_array[4] = INT_MAX;  // idx == 4 after the loop; make it explicit.
+ }
- // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
- if (forced_stop != 2) {
- hstep >>= 1;
- FIRST_LEVEL_CHECKS;
- if (quarteriters > 1) {
- SECOND_LEVEL_CHECKS;
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr;
+ bc = tc;
}
- tr = br;
- tc = bc;
- }
- if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
- hstep >>= 1;
- FIRST_LEVEL_CHECKS;
- if (eighthiters > 1) {
+ if (iters_per_step > 1)
SECOND_LEVEL_CHECKS;
- }
+
tr = br;
tc = bc;
+
+ search_step += 4;
+ hstep >>= 1;
+ best_idx = -1;
}
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two points if the diagonal was selected).
+
// These lines insure static analysis doesn't warn that
// tr and tc aren't used after the above point.
(void) tr;
@@ -1604,6 +1714,184 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
return bestsad;
}
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+ int best_sad = INT_MAX;
+ int this_sad;
+ int d;
+ int center, offset = 0;
+  int bw = 4 << bwl;  // Redundant variable; to be removed in later experiments.
+ for (d = 0; d <= bw; d += 16) {
+ this_sad = vp9_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+
+ for (d = -8; d <= 8; d += 16) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw)
+ continue;
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -4; d <= 4; d += 8) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw)
+ continue;
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -2; d <= 2; d += 4) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw)
+ continue;
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -1; d <= 1; d += 2) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw)
+ continue;
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+
+ return (center - (bw >> 1));
+}
+
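vector_match() above is a coarse-to-fine 1-D search: a stride-16 scan followed by probes at +-8, +-4, +-2, +-1 around the running best, re-centering once per scale, and the final return re-expresses the match as an offset from the block center. The five hand-unrolled loops follow one pattern, sketched generically below (cost stands in for vp9_vector_var()):

    /* Refine around center at scales 8, 4, 2, 1; bw bounds valid positions. */
    static int refine_1d(int (*cost)(int), int center, int best_cost, int bw) {
      int d, s;
      for (d = 8; d >= 1; d >>= 1) {
        const int offset = center;  /* re-center once per scale */
        for (s = -d; s <= d; s += 2 * d) {
          const int p = offset + s;
          int this_cost;
          if (p < 0 || p > bw)
            continue;
          this_cost = cost(p);
          if (this_cost < best_cost) {
            best_cost = this_cost;
            center = p;
          }
        }
      }
      return center;
    }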
+static const MV search_pos[4] = {
+ {-1, 0}, {0, -1}, {0, 1}, {1, 0},
+};
+
+unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ DECLARE_ALIGNED(16, int16_t, hbuf[128]);
+ DECLARE_ALIGNED(16, int16_t, vbuf[128]);
+ DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
+ DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+ int idx;
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ const int search_width = bw << 1;
+ const int search_height = bh << 1;
+ const int src_stride = x->plane[0].src.stride;
+ const int ref_stride = xd->plane[0].pre[0].stride;
+ uint8_t const *ref_buf, *src_buf;
+ MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv;
+ unsigned int best_sad, tmp_sad, this_sad[4];
+ MV this_mv;
+ const int norm_factor = 3 + (bw >> 5);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ tmp_mv->row = 0;
+ tmp_mv->col = 0;
+ return cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+#endif
+
+ // Set up prediction 1-D reference set
+ ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+ for (idx = 0; idx < search_width; idx += 16) {
+ vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+ ref_buf += 16;
+ }
+
+ ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+ for (idx = 0; idx < search_height; ++idx) {
+ vbuf[idx] = vp9_int_pro_col(ref_buf, bw) >> norm_factor;
+ ref_buf += ref_stride;
+ }
+
+ // Set up src 1-D reference set
+ for (idx = 0; idx < bw; idx += 16) {
+ src_buf = x->plane[0].src.buf + idx;
+ vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+ }
+
+ src_buf = x->plane[0].src.buf;
+ for (idx = 0; idx < bh; ++idx) {
+ src_vbuf[idx] = vp9_int_pro_col(src_buf, bw) >> norm_factor;
+ src_buf += src_stride;
+ }
+
+ // Find the best match per 1-D search
+ tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+ tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
+
+ this_mv = *tmp_mv;
+ src_buf = x->plane[0].src.buf;
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+ best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+ {
+ const uint8_t * const pos[4] = {
+ ref_buf - ref_stride,
+ ref_buf - 1,
+ ref_buf + 1,
+ ref_buf + ref_stride,
+ };
+
+ cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+ }
+
+ for (idx = 0; idx < 4; ++idx) {
+ if (this_sad[idx] < best_sad) {
+ best_sad = this_sad[idx];
+ tmp_mv->row = search_pos[idx].row + this_mv.row;
+ tmp_mv->col = search_pos[idx].col + this_mv.col;
+ }
+ }
+
+ if (this_sad[0] < this_sad[3])
+ this_mv.row -= 1;
+ else
+ this_mv.row += 1;
+
+ if (this_sad[1] < this_sad[2])
+ this_mv.col -= 1;
+ else
+ this_mv.col += 1;
+
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+ tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
+ ref_buf, ref_stride);
+ if (best_sad > tmp_sad) {
+ *tmp_mv = this_mv;
+ best_sad = tmp_sad;
+ }
+
+ tmp_mv->row *= 8;
+ tmp_mv->col *= 8;
+
+ return best_sad;
+}
+
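vp9_int_pro_motion_estimation() avoids full 2-D SAD scans by collapsing blocks into row and column projections and matching those 1-D signatures with vector_match(). A minimal sketch of the projection step (the >> norm_factor normalization used above is omitted):

    #include <stdint.h>

    /* Collapse a bw x bh block into a column-sum horizontal signature and
     * a row-sum vertical signature. */
    static void project(const uint8_t *buf, int stride, int bw, int bh,
                        int16_t *hproj, int16_t *vproj) {
      int r, c;
      for (c = 0; c < bw; ++c) {  /* column sums -> horizontal signature */
        int sum = 0;
        for (r = 0; r < bh; ++r)
          sum += buf[r * stride + c];
        hproj[c] = (int16_t)sum;
      }
      for (r = 0; r < bh; ++r) {  /* row sums -> vertical signature */
        int sum = 0;
        for (c = 0; c < bw; ++c)
          sum += buf[r * stride + c];
        vproj[r] = (int16_t)sum;
      }
    }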
/* do_refine: If last step (1-away) of n-step search doesn't pick the center
point as the best match, we will do a final 1-away diamond
refining search */
@@ -1654,7 +1942,7 @@ int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
if (do_refine) {
const int search_range = 8;
MV best_mv = *dst_mv;
- thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range,
+ thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range,
fn_ptr, ref_mv);
if (thissme < INT_MAX)
thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
@@ -1729,7 +2017,7 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
if (fn_ptr->sdx3f != NULL) {
while ((c + 2) < col_max) {
int i;
- unsigned int sads[3];
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
sads);
@@ -1794,7 +2082,7 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
if (fn_ptr->sdx8f != NULL) {
while ((c + 7) < col_max) {
int i;
- unsigned int sads[8];
+ DECLARE_ALIGNED(16, uint32_t, sads[8]);
fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
sads);
@@ -1818,7 +2106,7 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
if (fn_ptr->sdx3f != NULL) {
while ((c + 2) < col_max) {
int i;
- unsigned int sads[3];
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
sads);
@@ -1858,11 +2146,11 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
return best_sad;
}
-int vp9_refining_search_sad_c(const MACROBLOCK *x,
- MV *ref_mv, int error_per_bit,
- int search_range,
- const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv) {
+int vp9_refining_search_sad(const MACROBLOCK *x,
+ MV *ref_mv, int error_per_bit,
+ int search_range,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
const MACROBLOCKD *const xd = &x->e_mbd;
const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
const struct buf_2d *const what = &x->plane[0].src;
@@ -2029,7 +2317,7 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
1, cost_list, fn_ptr, ref_mv, tmp_mv);
break;
default:
- assert(!"Invalid search method.");
+ assert(0 && "Invalid search method.");
}
if (method != NSTEP && rd && var < var_max)
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h
index 9ddca250c7c..dd8a4607942 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h
@@ -66,7 +66,13 @@ struct SPEED_FEATURES;
int vp9_init_search_range(int size);
-// Runs sequence of diamond searches in smaller steps for RD
+int vp9_refining_search_sad(const struct macroblock *x,
+ struct mv *ref_mv,
+ int sad_per_bit, int distance,
+ const struct vp9_variance_vtable *fn_ptr,
+ const struct mv *center_mv);
+
+// Runs sequence of diamond searches in smaller steps for RD.
int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
@@ -74,6 +80,11 @@ int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv);
+// Perform integral-projection-based motion estimation.
+unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
+ MACROBLOCK *x,
+ BLOCK_SIZE bsize);
+
typedef int (integer_mv_pattern_search_fn) (
const MACROBLOCK *x,
MV *ref_mv,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c
index 85984fd7ef6..5eb5d542b78 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c
@@ -33,16 +33,23 @@ static int get_max_filter_level(const VP9_COMP *cpi) {
}
-static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
- int filt_level, int partial_frame) {
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+ VP9_COMP *const cpi,
+ int filt_level, int partial_frame) {
VP9_COMMON *const cm = &cpi->common;
- int filt_err;
+ int64_t filt_err;
+
+ if (cpi->num_workers > 1)
+ vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+ filt_level, 1, partial_frame,
+ cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
+ else
+ vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+ 1, partial_frame);
- vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1,
- partial_frame);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show, cm->bit_depth);
+ filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show);
} else {
filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
}
@@ -63,17 +70,18 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
const int min_filter_level = 0;
const int max_filter_level = get_max_filter_level(cpi);
int filt_direction = 0;
- int best_err, filt_best;
+ int64_t best_err;
+ int filt_best;
// Start the search at the previous frame filter level unless it is now out of
// range.
int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
// Sum squared error at each filter level
- int ss_err[MAX_LOOP_FILTER + 1];
+ int64_t ss_err[MAX_LOOP_FILTER + 1];
// Set each entry to -1
- vpx_memset(ss_err, 0xFF, sizeof(ss_err));
+ memset(ss_err, 0xFF, sizeof(ss_err));
// Make a copy of the unfiltered / processed recon buffer
vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
@@ -87,7 +95,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
// Bias against raising loop filter in favor of lowering it.
- int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
bias = (bias * cpi->twopass.section_intra_rating) / 20;
@@ -153,7 +161,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
// These values were determined by linearly fitting the searched filter
// level against q: filt_guess = q * 0.316206 + 3.87252
-#if CONFIG_VP9_HIGHDEPTH
+#if CONFIG_VP9_HIGHBITDEPTH
int filt_guess;
switch (cm->bit_depth) {
case VPX_BITS_8:
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
index b74b2dd56a2..9fb7cfba7bf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
@@ -20,9 +20,11 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
+#include "vp9/encoder/vp9_cost.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_ratectrl.h"
@@ -49,7 +51,7 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
int const_motion = 0;
// Blank the reference vector list
- vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+ memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
// The nearest 2 blocks are treated differently
// if the size < 8x8 we get the mv from the bmi substructure,
@@ -58,14 +60,15 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
- xd->mi_stride].src_mi;
+ xd->mi_stride];
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
// Keep counts for entropy encoding.
context_counter += mode_2_counter[candidate->mode];
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
- ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1));
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1),
+ refmv_count, mv_ref_list, Done);
}
}
@@ -78,11 +81,11 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
- xd->mi_stride].src_mi->mbmi;
+ xd->mi_stride]->mbmi;
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
- ADD_MV_REF_LIST(candidate->mv[0]);
+ ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, Done);
}
}
@@ -94,10 +97,11 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
- * xd->mi_stride].src_mi->mbmi;
+ * xd->mi_stride]->mbmi;
// If the candidate is INTRA we don't want to consider its mv.
- IF_DIFF_REF_FRAME_ADD_MV(candidate);
+ IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+ refmv_count, mv_ref_list, Done);
}
}
}
@@ -118,7 +122,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *tmp_mv, int *rate_mv,
int64_t best_rd_sofar) {
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
const int step_param = cpi->sf.mv.fullpel_search_step_param;
const int sadpb = x->sadperbit16;
@@ -135,10 +139,6 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int cost_list[5];
const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
ref);
- if (cpi->common.show_frame &&
- (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME])
- return rv;
-
if (scaled_ref_frame) {
int i;
// Swap out the reference frame for a version that's been scaled to
@@ -190,7 +190,8 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
- x->pred_mv[ref] = tmp_mv->as_mv;
+ *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
if (scaled_ref_frame) {
@@ -201,6 +202,250 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
return rv;
}
+static void block_variance(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int w, int h, unsigned int *sse, int *sum,
+ int block_size, unsigned int *sse8x8,
+ int *sum8x8, unsigned int *var8x8) {
+ int i, j, k = 0;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ vp9_get8x8var(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride,
+ &sse8x8[k], &sum8x8[k]);
+ *sse += sse8x8[k];
+ *sum += sum8x8[k];
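+      // Per-8x8 variance: sse - mean^2 * 64, where sum^2 >> 6 is sum^2 / 64.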
+ var8x8[k] = sse8x8[k] - (((unsigned int)sum8x8[k] * sum8x8[k]) >> 6);
+ k++;
+ }
+ }
+}
+
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+ unsigned int *sse_i, int *sum_i,
+ unsigned int *var_o, unsigned int *sse_o,
+ int *sum_o) {
+ const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
+ const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
+ const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
+ int i, j, k = 0;
+
+ for (i = 0; i < nh; i += 2) {
+ for (j = 0; j < nw; j += 2) {
+ sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] +
+ sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
+ sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
+ sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
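+      // The normalizing shift below is log2 of the merged block's pixel
+      // count (a 2x2 group of unit_size blocks).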
+ var_o[k] = sse_o[k] - (((unsigned int)sum_o[k] * sum_o[k]) >>
+ (b_width_log2_lookup[unit_size] +
+ b_height_log2_lookup[unit_size] + 6));
+ k++;
+ }
+ }
+}
+
+static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ unsigned int *var_y, unsigned int *sse_y,
+ int mi_row, int mi_col, int *early_term) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
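+  // For example, with 8-bit input a dequant step of 40 becomes an effective
+  // quantizer of 40 >> 3 = 5 (>> (bd - 5) in the high bit-depth case).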
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const uint32_t dc_quant = pd->dequant[0];
+ const uint32_t ac_quant = pd->dequant[1];
+ const int64_t dc_thr = dc_quant * dc_quant >> 6;
+ const int64_t ac_thr = ac_quant * ac_quant >> 6;
+ unsigned int var;
+ int sum;
+ int skip_dc = 0;
+
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ const int num8x8 = 1 << (bw + bh - 2);
+ unsigned int sse8x8[64] = {0};
+ int sum8x8[64] = {0};
+ unsigned int var8x8[64] = {0};
+ TX_SIZE tx_size;
+ int i, k;
+
+ // Calculate variance for whole partition, and also save 8x8 blocks' variance
+ // to be used in following transform skipping test.
+ block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
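+  // bw and bh are log2 dimensions in 4-pel units, so bw + bh + 4 is log2 of
+  // the partition's pixel count.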
+ var = sse - (((int64_t)sum * sum) >> (bw + bh + 4));
+
+ *var_y = var;
+ *sse_y = sse;
+
+ if (cpi->common.tx_mode == TX_MODE_SELECT) {
+ if (sse > (var << 2))
+ tx_size = MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ else
+ tx_size = TX_8X8;
+
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id))
+ tx_size = TX_8X8;
+ else if (tx_size > TX_16X16)
+ tx_size = TX_16X16;
+ }
+ } else {
+ tx_size = MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ }
+
+ assert(tx_size >= TX_8X8);
+ xd->mi[0]->mbmi.tx_size = tx_size;
+
+  // Evaluate if the partition block is a skippable block in the Y plane.
+ {
+ unsigned int sse16x16[16] = {0};
+ int sum16x16[16] = {0};
+ unsigned int var16x16[16] = {0};
+ const int num16x16 = num8x8 >> 2;
+
+ unsigned int sse32x32[4] = {0};
+ int sum32x32[4] = {0};
+ unsigned int var32x32[4] = {0};
+ const int num32x32 = num8x8 >> 4;
+
+ int ac_test = 1;
+ int dc_test = 1;
+ const int num = (tx_size == TX_8X8) ? num8x8 :
+ ((tx_size == TX_16X16) ? num16x16 : num32x32);
+ const unsigned int *sse_tx = (tx_size == TX_8X8) ? sse8x8 :
+ ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
+ const unsigned int *var_tx = (tx_size == TX_8X8) ? var8x8 :
+ ((tx_size == TX_16X16) ? var16x16 : var32x32);
+
+ // Calculate variance if tx_size > TX_8X8
+ if (tx_size >= TX_16X16)
+ calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+ sum16x16);
+ if (tx_size == TX_32X32)
+ calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
+ sse32x32, sum32x32);
+
+ // Skipping test
+ x->skip_txfm[0] = 0;
+ for (k = 0; k < num; k++)
+ // Check if all ac coefficients can be quantized to zero.
+ if (!(var_tx[k] < ac_thr || var == 0)) {
+ ac_test = 0;
+ break;
+ }
+
+ for (k = 0; k < num; k++)
+ // Check if dc coefficient can be quantized to zero.
+ if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+ dc_test = 0;
+ break;
+ }
+
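+    // skip_txfm encoding: 0 = no skip, 2 = all AC coefficients quantize to
+    // zero, 1 = both the AC and DC coefficients quantize to zero.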
+ if (ac_test) {
+ x->skip_txfm[0] = 2;
+
+ if (dc_test)
+ x->skip_txfm[0] = 1;
+ } else if (dc_test) {
+ skip_dc = 1;
+ }
+ }
+
+ if (x->skip_txfm[0] == 1) {
+ int skip_uv[2] = {0};
+ unsigned int var_uv[2];
+ unsigned int sse_uv[2];
+
+ *out_rate_sum = 0;
+ *out_dist_sum = sse << 4;
+
+ // Transform skipping test in UV planes.
+ for (i = 1; i <= 2; i++) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const TX_SIZE uv_tx_size = get_uv_tx_size(&xd->mi[0]->mbmi, pd);
+ const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+ const int uv_bw = b_width_log2_lookup[uv_bsize];
+ const int uv_bh = b_height_log2_lookup[uv_bsize];
+ const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
+ (uv_bh - b_height_log2_lookup[unit_size]);
+ const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
+ const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
+ int j = i - 1;
+
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+ var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+
+ if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+ (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+ skip_uv[j] = 1;
+ else
+ break;
+ }
+
+    // If the transforms in all of the YUV planes are skippable, the mode
+    // search checks fewer inter modes and doesn't check intra modes.
+ if (skip_uv[0] & skip_uv[1]) {
+ *early_term = 1;
+ }
+
+ return;
+ }
+
+ if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+ }
+#else
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ if (!skip_dc) {
+ *out_rate_sum = rate >> 1;
+ *out_dist_sum = dist << 3;
+ } else {
+ *out_rate_sum = 0;
+ *out_dist_sum = (sse - var) << 4;
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
+ }
+#else
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *out_rate_sum += rate;
+ *out_dist_sum += dist << 4;
+}
static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
@@ -214,79 +459,298 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
int64_t dist;
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &xd->plane[0];
+ const int64_t dc_thr = p->quant_thred[0] >> 6;
+ const int64_t ac_thr = p->quant_thred[1] >> 6;
const uint32_t dc_quant = pd->dequant[0];
const uint32_t ac_quant = pd->dequant[1];
unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
pd->dst.buf, pd->dst.stride, &sse);
+ int skip_dc = 0;
+
*var_y = var;
*sse_y = sse;
- if (sse < dc_quant * dc_quant >> 6)
- x->skip_txfm[0] = 1;
- else if (var < ac_quant * ac_quant >> 6)
- x->skip_txfm[0] = 2;
- else
- x->skip_txfm[0] = 0;
-
if (cpi->common.tx_mode == TX_MODE_SELECT) {
if (sse > (var << 2))
- xd->mi[0].src_mi->mbmi.tx_size =
+ xd->mi[0]->mbmi.tx_size =
MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
else
- xd->mi[0].src_mi->mbmi.tx_size = TX_8X8;
-
- if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
- xd->mi[0].src_mi->mbmi.tx_size > TX_16X16)
- xd->mi[0].src_mi->mbmi.tx_size = TX_16X16;
+ xd->mi[0]->mbmi.tx_size = TX_8X8;
+
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id))
+ xd->mi[0]->mbmi.tx_size = TX_8X8;
+ else if (xd->mi[0]->mbmi.tx_size > TX_16X16)
+ xd->mi[0]->mbmi.tx_size = TX_16X16;
+ }
} else {
- xd->mi[0].src_mi->mbmi.tx_size =
+ xd->mi[0]->mbmi.tx_size =
MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
}
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
- dc_quant >> (xd->bd - 5), &rate, &dist);
- } else {
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
- dc_quant >> 3, &rate, &dist);
+  // Evaluate if the partition block is a skippable block in the Y plane.
+ {
+ const BLOCK_SIZE unit_size =
+ txsize_to_bsize[xd->mi[0]->mbmi.tx_size];
+ const unsigned int num_blk_log2 =
+ (b_width_log2_lookup[bsize] - b_width_log2_lookup[unit_size]) +
+ (b_height_log2_lookup[bsize] - b_height_log2_lookup[unit_size]);
+ const unsigned int sse_tx = sse >> num_blk_log2;
+ const unsigned int var_tx = var >> num_blk_log2;
+
+ x->skip_txfm[0] = 0;
+ // Check if all ac coefficients can be quantized to zero.
+ if (var_tx < ac_thr || var == 0) {
+ x->skip_txfm[0] = 2;
+ // Check if dc coefficient can be quantized to zero.
+ if (sse_tx - var_tx < dc_thr || sse == var)
+ x->skip_txfm[0] = 1;
+ } else {
+ if (sse_tx - var_tx < dc_thr || sse == var)
+ skip_dc = 1;
+ }
}
+
+ if (x->skip_txfm[0] == 1) {
+ *out_rate_sum = 0;
+ *out_dist_sum = sse << 4;
+ return;
+ }
+
+ if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+ }
#else
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
- dc_quant >> 3, &rate, &dist);
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
- *out_rate_sum = rate >> 1;
- *out_dist_sum = dist << 3;
+ if (!skip_dc) {
+ *out_rate_sum = rate >> 1;
+ *out_dist_sum = dist << 3;
+ } else {
+ *out_rate_sum = 0;
+ *out_dist_sum = (sse - var) << 4;
+ }
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(var,
- 1 << num_pels_log2_lookup[bsize],
- ac_quant >> (xd->bd - 5),
- &rate,
- &dist);
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
} else {
- vp9_model_rd_from_var_lapndz(var,
- 1 << num_pels_log2_lookup[bsize],
- ac_quant >> 3,
- &rate,
- &dist);
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
}
#else
- vp9_model_rd_from_var_lapndz(var,
- 1 << num_pels_log2_lookup[bsize],
- ac_quant >> 3,
- &rate,
- &dist);
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
*out_rate_sum += rate;
*out_dist_sum += dist << 4;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+ int *skippable, int64_t *sse, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int var_y, sse_y;
+ (void)plane;
+ (void)tx_size;
+ model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+ *sse = INT_MAX;
+ *skippable = 0;
+ return;
+}
+#else
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+ int *skippable, int64_t *sse, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ const int step = 1 << (tx_size << 1);
+ const int block_step = (1 << tx_size);
+ int block = 0, r, c;
+ int shift = tx_size == TX_32X32 ? 0 : 2;
+ const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+ xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+ xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+ int eob_cost = 0;
+
+ (void)cpi;
+ vp9_subtract_plane(x, bsize, plane);
+ *skippable = 1;
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int16_t *src_diff;
+ src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
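+          // 8x8 and 16x16 use a Hadamard transform as a cheap stand-in for
+          // the DCT in this non-RD estimate; 32x32 and 4x4 use the regular
+          // forward transforms.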
+ switch (tx_size) {
+ case TX_32X32:
+ vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
+ vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_16X16:
+ vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+ vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+ vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_4X4:
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ *skippable &= (*eob == 0);
+ eob_cost += 1;
+ }
+ block += step;
+ }
+ }
+
+ if (*skippable && *sse < INT64_MAX) {
+ *rate = 0;
+ *dist = (*sse << 6) >> shift;
+ *sse = *dist;
+ return;
+ }
+
+ block = 0;
+ *rate = 0;
+ *dist = 0;
+ *sse = (*sse << 6) >> shift;
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+
+ if (*eob == 1)
+ *rate += (int)abs(qcoeff[0]);
+ else if (*eob > 1)
+ *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+
+ *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
+ }
+ block += step;
+ }
+ }
+
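+  // Scale the SATD-based estimate into the rate units used by RDCOST; the
+  // shift amounts appear to be empirically chosen scale factors.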
+ if (*skippable == 0) {
+ *rate <<= 10;
+ *rate += (eob_cost << 8);
+ }
+}
+#endif
+
+static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ unsigned int *var_y, unsigned int *sse_y) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ int i;
+
+ *out_rate_sum = 0;
+ *out_dist_sum = 0;
+
+ for (i = 1; i <= 2; ++i) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const uint32_t dc_quant = pd->dequant[0];
+ const uint32_t ac_quant = pd->dequant[1];
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+ unsigned int var;
+
+ if (!x->color_sensitivity[i - 1])
+ continue;
+
+ var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, &sse);
+ *var_y += var;
+ *sse_y += sse;
+
+ #if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+ dc_quant >> 3, &rate, &dist);
+ }
+ #else
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+ dc_quant >> 3, &rate, &dist);
+ #endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *out_rate_sum += rate >> 1;
+ *out_dist_sum += dist << 3;
+
+ #if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+ ac_quant >> 3, &rate, &dist);
+ }
+ #else
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+ ac_quant >> 3, &rate, &dist);
+ #endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *out_rate_sum += rate;
+ *out_dist_sum += dist << 4;
+ }
+}
+
static int get_pred_buffer(PRED_BUFFER *p, int len) {
int i;
@@ -312,7 +776,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
struct buf_2d yv12_mb[][MAX_MB_PLANE],
int *rate, int64_t *dist) {
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
unsigned int var = var_y, sse = sse_y;
@@ -329,11 +793,11 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
const unsigned int min_thresh =
MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
#if CONFIG_VP9_HIGHBITDEPTH
- const int shift = 2 * xd->bd - 16;
+ const int shift = (xd->bd << 1) - 16;
#endif
// Calculate threshold according to dequant value.
- thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+ thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) >> 3;
#if CONFIG_VP9_HIGHBITDEPTH
if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
@@ -375,14 +839,14 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
xd->plane[1].dst.stride, &sse_u);
// U skipping condition checking
- if ((var_u * 4 <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
+ if (((var_u << 2) <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
x->plane[2].src.stride,
xd->plane[2].dst.buf,
xd->plane[2].dst.stride, &sse_v);
// V skipping condition checking
- if ((var_v * 4 <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
+ if (((var_v << 2) <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
x->skip = 1;
// The cost of skip bit needs to be added.
@@ -428,7 +892,9 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
int i, j;
int rate;
int64_t dist;
- unsigned int var_y, sse_y;
+ int64_t this_sse = INT64_MAX;
+ int is_skippable;
+
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
assert(plane == 0);
(void) plane;
@@ -439,67 +905,189 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_predict_intra_block(xd, block >> (2 * tx_size),
b_width_log2_lookup[plane_bsize],
tx_size, args->mode,
- p->src.buf, src_stride,
+ x->skip_encode ? p->src.buf : pd->dst.buf,
+ x->skip_encode ? src_stride : dst_stride,
pd->dst.buf, dst_stride,
i, j, 0);
- // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
- model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
+
+ // TODO(jingning): This needs further refactoring.
+ block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
+ bsize_tx, MIN(tx_size, TX_16X16));
+ x->skip_txfm[0] = is_skippable;
+ rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+
p->src.buf = src_buf_base;
pd->dst.buf = dst_buf_base;
args->rate += rate;
args->dist += dist;
}
-static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = {
+static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
+ {THR_DC, THR_V_PRED, THR_H_PRED, THR_TM},
{THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
{THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
- {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
+};
+
+static const PREDICTION_MODE intra_mode_list[] = {
+ DC_PRED, V_PRED, H_PRED, TM_PRED
+};
+
+static int mode_offset(const PREDICTION_MODE mode) {
+ if (mode >= NEARESTMV) {
+ return INTER_OFFSET(mode);
+ } else {
+ switch (mode) {
+ case DC_PRED:
+ return 0;
+ case V_PRED:
+ return 1;
+ case H_PRED:
+ return 2;
+ case TM_PRED:
+ return 3;
+ default:
+ return -1;
+ }
+ }
+}
+
+static INLINE void update_thresh_freq_fact(VP9_COMP *cpi,
+ TileDataEnc *tile_data,
+ BLOCK_SIZE bsize,
+ MV_REFERENCE_FRAME ref_frame,
+ THR_MODES best_mode_idx,
+ PREDICTION_MODE mode) {
+ THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+ int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx];
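+  // Decay the threshold factor of the winning mode and inflate the others
+  // toward a cap, biasing future searches to recent winners.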
+ if (thr_mode_idx == best_mode_idx)
+ *freq_fact -= (*freq_fact >> 4);
+ else
+ *freq_fact = MIN(*freq_fact + RD_THRESH_INC,
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+}
+
+void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ RD_COST this_rdc, best_rdc;
+ PREDICTION_MODE this_mode;
+ struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
+ const TX_SIZE intra_tx_size =
+ MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ MODE_INFO *const mic = xd->mi[0];
+ int *bmode_costs;
+ const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
+ const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
+ const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
+ const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
+ bmode_costs = cpi->y_mode_costs[A][L];
+
+ (void) ctx;
+ vp9_rd_cost_reset(&best_rdc);
+ vp9_rd_cost_reset(&this_rdc);
+
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->mv[0].as_int = INVALID_MV;
+ mbmi->uv_mode = DC_PRED;
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+
+ // Change the limit of this loop to add other intra prediction
+ // mode tests.
+ for (this_mode = DC_PRED; this_mode <= H_PRED; ++this_mode) {
+ args.mode = this_mode;
+ args.rate = 0;
+ args.dist = 0;
+ mbmi->tx_size = intra_tx_size;
+ vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
+ estimate_block_intra, &args);
+ this_rdc.rate = args.rate;
+ this_rdc.dist = args.dist;
+ this_rdc.rate += bmode_costs[this_mode];
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ this_rdc.rate, this_rdc.dist);
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ mbmi->mode = this_mode;
+ }
+ }
+
+ *rd_cost = best_rdc;
+}
+
+static void init_ref_frame_cost(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ int ref_frame_cost[MAX_REF_FRAMES]) {
+ vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+ vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+ vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+
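+  // Cost of signalling each reference: the intra/inter bit, plus the
+  // single-reference tree bits for the inter frames.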
+ ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
+ ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
+ ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1);
+
+ ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
+ ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
+ ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
+}
+
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame;
+ PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+#define RT_INTER_MODES 8
+static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
+ {LAST_FRAME, ZEROMV},
+ {LAST_FRAME, NEARESTMV},
+ {GOLDEN_FRAME, ZEROMV},
+ {LAST_FRAME, NEARMV},
+ {LAST_FRAME, NEWMV},
+ {GOLDEN_FRAME, NEARESTMV},
+ {GOLDEN_FRAME, NEARMV},
+ {GOLDEN_FRAME, NEWMV}
};
// TODO(jingning) placeholder for inter-frame non-RD mode decision.
// This needs various further optimizations. To be continued.
void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- const TileInfo *const tile,
- int mi_row, int mi_col,
- int *returnrate,
- int64_t *returndistortion,
- BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx) {
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
struct macroblockd_plane *const pd = &xd->plane[0];
PREDICTION_MODE best_mode = ZEROMV;
MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
- TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize],
- tx_mode_to_biggest_tx_size[cm->tx_mode]);
+ MV_REFERENCE_FRAME usable_ref_frame;
+ TX_SIZE best_tx_size = TX_SIZES;
INTERP_FILTER best_pred_filter = EIGHTTAP;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
- int64_t best_rd = INT64_MAX;
- int64_t this_rd = INT64_MAX;
- uint8_t skip_txfm = 0;
- int rate = INT_MAX;
- int64_t dist = INT64_MAX;
+ RD_COST this_rdc, best_rdc;
+ uint8_t skip_txfm = 0, best_mode_skip_txfm = 0;
// var_y and sse_y are saved to be used in the skip checking below.
unsigned int var_y = UINT_MAX;
unsigned int sse_y = UINT_MAX;
// Reduce the intra cost penalty for small blocks (<=16x16).
const int reduction_fac =
(cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
- bsize <= BLOCK_16X16) ? 4 : 1;
+ bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
const int intra_cost_penalty = vp9_get_intra_cost_penalty(
- cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) / reduction_fac;
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
intra_cost_penalty, 0);
- const int intra_mode_cost = 50;
-
- const int8_t segment_id = mbmi->segment_id;
- const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
- const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize];
- INTERP_FILTER filter_ref = cm->interp_filter;
+ const int *const rd_threshes = cpi->rd.threshes[mbmi->segment_id][bsize];
+ const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+ INTERP_FILTER filter_ref;
const int bsl = mi_width_log2_lookup[bsize];
const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
(((mi_row + mi_col) >> bsl) +
@@ -511,16 +1099,24 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// process.
// tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
PRED_BUFFER tmp[4];
- DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
+ DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64]);
#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED_ARRAY(16, uint16_t, pred_buf_16, 3 * 64 * 64);
+ DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]);
#endif
struct buf_2d orig_dst = pd->dst;
PRED_BUFFER *best_pred = NULL;
PRED_BUFFER *this_mode_pred = NULL;
const int pixels_in_block = bh * bw;
+ int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready;
+ int ref_frame_skip_mask = 0;
+ int idx;
+ int best_pred_sad = INT_MAX;
+ int best_early_term = 0;
+ int ref_frame_cost[MAX_REF_FRAMES];
+
+ init_ref_frame_cost(cm, xd, ref_frame_cost);
- if (cpi->sf.reuse_inter_pred_sby) {
+ if (reuse_inter_pred) {
int i;
for (i = 0; i < 3; i++) {
#if CONFIG_VP9_HIGHBITDEPTH
@@ -542,42 +1138,52 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
x->skip = 0;
+ if (xd->up_available)
+ filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+ else if (xd->left_available)
+ filter_ref = xd->mi[-1]->mbmi.interp_filter;
+ else
+ filter_ref = cm->interp_filter;
+
// initialize mode decisions
- *returnrate = INT_MAX;
- *returndistortion = INT64_MAX;
- vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
+ vp9_rd_cost_reset(&best_rdc);
+ vp9_rd_cost_reset(rd_cost);
mbmi->sb_type = bsize;
mbmi->ref_frame[0] = NONE;
mbmi->ref_frame[1] = NONE;
mbmi->tx_size = MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cm->tx_mode]);
- mbmi->interp_filter = cm->interp_filter == SWITCHABLE ?
- EIGHTTAP : cm->interp_filter;
- mbmi->segment_id = segment_id;
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- PREDICTION_MODE this_mode;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ vp9_denoiser_reset_frame_stats(ctx);
+#endif
+
+ if (cpi->rc.frames_since_golden == 0) {
+ usable_ref_frame = LAST_FRAME;
+ } else {
+ usable_ref_frame = GOLDEN_FRAME;
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+
x->pred_mv_sad[ref_frame] = INT_MAX;
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
frame_mv[ZEROMV][ref_frame].as_int = 0;
- if (xd->up_available)
- filter_ref = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter;
- else if (xd->left_available)
- filter_ref = xd->mi[-1].src_mi->mbmi.interp_filter;
-
- if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
int_mv *const candidates = mbmi->ref_mvs[ref_frame];
const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+
vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
sf, sf);
- if (!cm->error_resilient_mode)
- vp9_find_mv_refs(cm, xd, tile, xd->mi[0].src_mi, ref_frame,
- candidates, mi_row, mi_col);
+ if (cm->use_prev_frame_mvs)
+ vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame,
+ candidates, mi_row, mi_col, NULL, NULL);
else
- const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0].src_mi,
+ const_motion[ref_frame] = mv_refs_rt(cm, xd, tile_info,
+ xd->mi[0],
ref_frame, candidates,
mi_row, mi_col);
@@ -589,195 +1195,288 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
ref_frame, bsize);
} else {
- continue;
+ ref_frame_skip_mask |= (1 << ref_frame);
}
+ }
- // Select prediction reference frames.
- xd->plane[0].pre[0] = yv12_mb[ref_frame][0];
+ for (idx = 0; idx < RT_INTER_MODES; ++idx) {
+ int rate_mv = 0;
+ int mode_rd_thresh;
+ int mode_index;
+ int i;
+ PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+ int64_t this_sse;
+ int is_skippable;
+ int this_early_term = 0;
+
+ if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
+ continue;
+
+ ref_frame = ref_mode_set[idx].ref_frame;
+ if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+ continue;
+ if (const_motion[ref_frame] && this_mode == NEARMV)
+ continue;
- clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
- clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
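+    // Skip this reference if its predicted-MV SAD is more than twice that of
+    // the other available reference.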
+ i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+ if (cpi->ref_frame_flags & flag_list[i])
+ if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+ ref_frame_skip_mask |= (1 << ref_frame);
+ if (ref_frame_skip_mask & (1 << ref_frame))
+ continue;
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
mbmi->ref_frame[0] = ref_frame;
+ set_ref_ptrs(cm, xd, ref_frame, NONE);
- for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
- int rate_mv = 0;
- int mode_rd_thresh;
+ mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
+ mode_rd_thresh = best_mode_skip_txfm ?
+ rd_threshes[mode_index] << 1 : rd_threshes[mode_index];
+ if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+ rd_thresh_freq_fact[mode_index]))
+ continue;
- if (const_motion[ref_frame] &&
- (this_mode == NEARMV || this_mode == ZEROMV))
- continue;
+ if (this_mode == NEWMV) {
+ if (ref_frame > LAST_FRAME) {
+ int tmp_sad;
+ int dis, cost_list[5];
- if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
- continue;
+ if (bsize < BLOCK_16X16)
+ continue;
- mode_rd_thresh =
- rd_threshes[mode_idx[ref_frame -
- LAST_FRAME][INTER_OFFSET(this_mode)]];
- if (rd_less_than_thresh(best_rd, mode_rd_thresh,
- rd_thresh_freq_fact[this_mode]))
- continue;
+ tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
- if (this_mode == NEWMV) {
- if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
- this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
+ if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
continue;
- if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
- &frame_mv[NEWMV][ref_frame],
- &rate_mv, best_rd))
+ if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad)
continue;
- }
- if (this_mode != NEARESTMV &&
- frame_mv[this_mode][ref_frame].as_int ==
- frame_mv[NEARESTMV][ref_frame].as_int)
+ frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
+ rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+ frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+ cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ cpi->common.allow_high_precision_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dis,
+ &x->pred_sse[ref_frame], NULL, 0, 0);
+ } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+ &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost)) {
continue;
+ }
+ }
- mbmi->mode = this_mode;
- mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
-
- // Search for the best prediction filter type, when the resulting
- // motion vector is at sub-pixel accuracy level for luma component, i.e.,
- // the last three bits are all zeros.
- if (cpi->sf.reuse_inter_pred_sby) {
- if (this_mode == NEARESTMV) {
- this_mode_pred = &tmp[3];
- } else {
- this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
- pd->dst.buf = this_mode_pred->data;
- pd->dst.stride = bw;
- }
+ if (this_mode == NEWMV && ref_frame == LAST_FRAME &&
+ frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) {
+ const int pre_stride = xd->plane[0].pre[0].stride;
+ const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
+ (frame_mv[NEWMV][LAST_FRAME].as_mv.row >> 3) * pre_stride +
+ (frame_mv[NEWMV][LAST_FRAME].as_mv.col >> 3);
+ best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+ x->plane[0].src.stride,
+ pre_buf, pre_stride);
+ x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
+ }
+
+ if (this_mode != NEARESTMV &&
+ frame_mv[this_mode][ref_frame].as_int ==
+ frame_mv[NEARESTMV][ref_frame].as_int)
+ continue;
+
+ mbmi->mode = this_mode;
+ mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+
+    // Search for the best prediction filter type, when the resulting
+    // motion vector is at sub-pixel accuracy level for the luma component,
+    // i.e., the last three bits are not all zero.
+ if (reuse_inter_pred) {
+ if (!this_mode_pred) {
+ this_mode_pred = &tmp[3];
+ } else {
+ this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = bw;
}
+ }
+
+ if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search
+ && (ref_frame == LAST_FRAME)
+ && (((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07) != 0)) {
+ int pf_rate[3];
+ int64_t pf_dist[3];
+ unsigned int pf_var[3];
+ unsigned int pf_sse[3];
+ TX_SIZE pf_tx_size[3];
+ int64_t best_cost = INT64_MAX;
+ INTERP_FILTER best_filter = SWITCHABLE, filter;
+ PRED_BUFFER *current_pred = this_mode_pred;
+
+ for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) {
+ int64_t cost;
+ mbmi->interp_filter = filter;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
+ &pf_var[filter], &pf_sse[filter]);
+ pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
+ cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
+ pf_tx_size[filter] = mbmi->tx_size;
+ if (cost < best_cost) {
+ best_filter = filter;
+ best_cost = cost;
+ skip_txfm = x->skip_txfm[0];
+
+ if (reuse_inter_pred) {
+ if (this_mode_pred != current_pred) {
+ free_pred_buffer(this_mode_pred);
+ this_mode_pred = current_pred;
+ }
- if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
- pred_filter_search &&
- ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
- (mbmi->mv[0].as_mv.col & 0x07) != 0)) {
- int pf_rate[3];
- int64_t pf_dist[3];
- unsigned int pf_var[3];
- unsigned int pf_sse[3];
- TX_SIZE pf_tx_size[3];
- int64_t best_cost = INT64_MAX;
- INTERP_FILTER best_filter = SWITCHABLE, filter;
- PRED_BUFFER *current_pred = this_mode_pred;
-
- for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
- int64_t cost;
- mbmi->interp_filter = filter;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
- model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter],
- &pf_dist[filter], &pf_var[filter], &pf_sse[filter]);
- cost = RDCOST(x->rdmult, x->rddiv,
- vp9_get_switchable_rate(cpi) + pf_rate[filter],
- pf_dist[filter]);
- pf_tx_size[filter] = mbmi->tx_size;
- if (cost < best_cost) {
- best_filter = filter;
- best_cost = cost;
- skip_txfm = x->skip_txfm[0];
-
- if (cpi->sf.reuse_inter_pred_sby) {
- if (this_mode_pred != current_pred) {
- free_pred_buffer(this_mode_pred);
- this_mode_pred = current_pred;
- }
-
- if (filter < EIGHTTAP_SHARP) {
- current_pred = &tmp[get_pred_buffer(tmp, 3)];
- pd->dst.buf = current_pred->data;
- pd->dst.stride = bw;
- }
+ if (filter < EIGHTTAP_SHARP) {
+ current_pred = &tmp[get_pred_buffer(tmp, 3)];
+ pd->dst.buf = current_pred->data;
+ pd->dst.stride = bw;
}
}
}
+ }
- if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred)
- free_pred_buffer(current_pred);
-
- mbmi->interp_filter = best_filter;
- mbmi->tx_size = pf_tx_size[mbmi->interp_filter];
- rate = pf_rate[mbmi->interp_filter];
- dist = pf_dist[mbmi->interp_filter];
- var_y = pf_var[mbmi->interp_filter];
- sse_y = pf_sse[mbmi->interp_filter];
- x->skip_txfm[0] = skip_txfm;
+ if (reuse_inter_pred && this_mode_pred != current_pred)
+ free_pred_buffer(current_pred);
+
+ mbmi->interp_filter = best_filter;
+ mbmi->tx_size = pf_tx_size[best_filter];
+ this_rdc.rate = pf_rate[best_filter];
+ this_rdc.dist = pf_dist[best_filter];
+ var_y = pf_var[best_filter];
+ sse_y = pf_sse[best_filter];
+ x->skip_txfm[0] = skip_txfm;
+ if (reuse_inter_pred) {
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = this_mode_pred->stride;
+ }
+ } else {
+ mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+
+      // For large partition blocks, extra skip testing is done to allow
+      // early termination of the mode search.
+ if (bsize > BLOCK_32X32 &&
+ !cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id) &&
+ cm->base_qindex) {
+ model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
+ &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
+ &this_early_term);
} else {
- mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
- model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+ &var_y, &sse_y);
}
+ }
- rate += rate_mv;
- rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
- [INTER_OFFSET(this_mode)];
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
-
- // Skipping checking: test to see if this block can be reconstructed by
- // prediction only.
- if (cpi->allow_encode_breakout) {
- encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame,
- this_mode, var_y, sse_y, yv12_mb, &rate, &dist);
- if (x->skip) {
- rate += rate_mv;
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+ if (!this_early_term) {
+ this_sse = (int64_t)sse_y;
+ block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
+ &this_sse, 0, bsize, MIN(mbmi->tx_size, TX_16X16));
+ x->skip_txfm[0] = is_skippable;
+ if (is_skippable) {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ } else {
+ if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+ } else {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ this_rdc.dist = this_sse;
+ x->skip_txfm[0] = 1;
}
}
-#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
+ if (cm->interp_filter == SWITCHABLE) {
+ if ((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07)
+ this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
}
-#else
- (void)ctx;
-#endif
-
- if (this_rd < best_rd || x->skip) {
- best_rd = this_rd;
- *returnrate = rate;
- *returndistortion = dist;
- best_mode = this_mode;
- best_pred_filter = mbmi->interp_filter;
- best_tx_size = mbmi->tx_size;
- best_ref_frame = ref_frame;
- skip_txfm = x->skip_txfm[0];
+ } else {
+ this_rdc.rate += cm->interp_filter == SWITCHABLE ?
+ vp9_get_switchable_rate(cpi, xd) : 0;
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ }
- if (cpi->sf.reuse_inter_pred_sby) {
- free_pred_buffer(best_pred);
+ if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+ int uv_rate = 0;
+ int64_t uv_dist = 0;
+ if (x->color_sensitivity[0])
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
+ if (x->color_sensitivity[1])
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+ model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
+ &var_y, &sse_y);
+ this_rdc.rate += uv_rate;
+ this_rdc.dist += uv_dist;
+ }
- best_pred = this_mode_pred;
- }
- } else {
- if (cpi->sf.reuse_inter_pred_sby)
- free_pred_buffer(this_mode_pred);
+ this_rdc.rate += rate_mv;
+ this_rdc.rate +=
+ cpi->inter_mode_cost[mbmi->mode_context[ref_frame]][INTER_OFFSET(
+ this_mode)];
+ this_rdc.rate += ref_frame_cost[ref_frame];
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+
+    // Skip check: test to see if this block can be reconstructed by
+    // prediction only.
+ if (cpi->allow_encode_breakout) {
+ encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode,
+ var_y, sse_y, yv12_mb, &this_rdc.rate,
+ &this_rdc.dist);
+ if (x->skip) {
+ this_rdc.rate += rate_mv;
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate,
+ this_rdc.dist);
}
+ }
- if (x->skip)
- break;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0)
+ vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
+#else
+ (void)ctx;
+#endif
+
+ if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
+ best_rdc = this_rdc;
+ best_mode = this_mode;
+ best_pred_filter = mbmi->interp_filter;
+ best_tx_size = mbmi->tx_size;
+ best_ref_frame = ref_frame;
+ best_mode_skip_txfm = x->skip_txfm[0];
+ best_early_term = this_early_term;
+
+ if (reuse_inter_pred) {
+ free_pred_buffer(best_pred);
+ best_pred = this_mode_pred;
+ }
+ } else {
+ if (reuse_inter_pred)
+ free_pred_buffer(this_mode_pred);
}
- // If the current reference frame is valid and we found a usable mode,
- // we are done.
- if (best_rd < INT64_MAX)
+
+ if (x->skip)
break;
- }
- // If best prediction is not in dst buf, then copy the prediction block from
- // temp buf to dst buf.
- if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby &&
- best_pred->data != orig_dst.buf) {
- pd->dst = orig_dst;
-#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- vp9_highbd_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,
- NULL, 0, NULL, 0, bw, bh, xd->bd);
- } else {
- vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,
- NULL, 0, NULL, 0, bw, bh);
+    // If the early termination flag is set and at least two modes have been
+    // checked, terminate the mode search.
+ if (best_early_term && idx > 0) {
+ x->skip = 1;
+ break;
}
-#else
- vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, NULL, 0,
- NULL, 0, bw, bh);
-#endif // CONFIG_VP9_HIGHBITDEPTH
}
mbmi->mode = best_mode;
@@ -785,53 +1484,404 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->tx_size = best_tx_size;
mbmi->ref_frame[0] = best_ref_frame;
mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
- xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
- x->skip_txfm[0] = skip_txfm;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+ x->skip_txfm[0] = best_mode_skip_txfm;
// Perform an intra prediction search if the best RD cost is above a certain
// threshold.
- if (!x->skip && best_rd > inter_mode_thresh &&
- bsize <= cpi->sf.max_intra_bsize) {
- PREDICTION_MODE this_mode;
+ if (best_rdc.rdcost == INT64_MAX ||
+ (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
+ bsize <= cpi->sf.max_intra_bsize)) {
struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
const TX_SIZE intra_tx_size =
MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ int i;
+ TX_SIZE best_intra_tx_size = TX_SIZES;
- if (cpi->sf.reuse_inter_pred_sby) {
- pd->dst.buf = tmp[0].data;
- pd->dst.stride = bw;
+ if (reuse_inter_pred && best_pred != NULL) {
+ if (best_pred->data == orig_dst.buf) {
+ this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride,
+ NULL, 0, NULL, 0, bw, bh, xd->bd);
+ else
+ vp9_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride,
+ NULL, 0, NULL, 0, bw, bh);
+#else
+ vp9_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride,
+ NULL, 0, NULL, 0, bw, bh);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ best_pred = this_mode_pred;
+ }
}
+ pd->dst = orig_dst;
+
+ for (i = 0; i < 4; ++i) {
+ const PREDICTION_MODE this_mode = intra_mode_list[i];
+ THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+ int mode_rd_thresh = rd_threshes[mode_index];
+
+ if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize]))
+ continue;
- for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
- const TX_SIZE saved_tx_size = mbmi->tx_size;
+ if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+ rd_thresh_freq_fact[mode_index]))
+ continue;
+
+ mbmi->mode = this_mode;
+ mbmi->ref_frame[0] = INTRA_FRAME;
args.mode = this_mode;
args.rate = 0;
args.dist = 0;
mbmi->tx_size = intra_tx_size;
vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
estimate_block_intra, &args);
- mbmi->tx_size = saved_tx_size;
- rate = args.rate;
- dist = args.dist;
- rate += cpi->mbmode_cost[this_mode];
- rate += intra_cost_penalty;
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
-
- if (this_rd + intra_mode_cost < best_rd) {
- best_rd = this_rd;
- *returnrate = rate;
- *returndistortion = dist;
- mbmi->mode = this_mode;
- mbmi->tx_size = intra_tx_size;
- mbmi->ref_frame[0] = INTRA_FRAME;
+ this_rdc.rate = args.rate;
+ this_rdc.dist = args.dist;
+ this_rdc.rate += cpi->mbmode_cost[this_mode];
+ this_rdc.rate += ref_frame_cost[INTRA_FRAME];
+ this_rdc.rate += intra_cost_penalty;
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ this_rdc.rate, this_rdc.dist);
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ best_mode = this_mode;
+ best_intra_tx_size = mbmi->tx_size;
+ best_ref_frame = INTRA_FRAME;
mbmi->uv_mode = this_mode;
mbmi->mv[0].as_int = INVALID_MV;
- } else {
- x->skip_txfm[0] = skip_txfm;
+ best_mode_skip_txfm = x->skip_txfm[0];
+ }
+ }
+
+ // Reset mb_mode_info to the best inter mode.
+ if (best_ref_frame != INTRA_FRAME) {
+ mbmi->tx_size = best_tx_size;
+ } else {
+ mbmi->tx_size = best_intra_tx_size;
+ }
+ }
+
+ pd->dst = orig_dst;
+ mbmi->mode = best_mode;
+ mbmi->ref_frame[0] = best_ref_frame;
+ x->skip_txfm[0] = best_mode_skip_txfm;
+
+ if (reuse_inter_pred && best_pred != NULL) {
+ if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+ pd->dst.buf, pd->dst.stride, NULL, 0,
+ NULL, 0, bw, bh, xd->bd);
+ else
+ vp9_convolve_copy(best_pred->data, best_pred->stride,
+ pd->dst.buf, pd->dst.stride, NULL, 0,
+ NULL, 0, bw, bh);
+#else
+ vp9_convolve_copy(best_pred->data, best_pred->stride,
+ pd->dst.buf, pd->dst.stride, NULL, 0,
+ NULL, 0, bw, bh);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ }
+
+ if (cpi->sf.adaptive_rd_thresh) {
+ THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mbmi->mode)];
+ PREDICTION_MODE this_mode;
+
+ if (best_ref_frame == INTRA_FRAME) {
+ // Only consider the modes that are included in the intra_mode_list.
+ int intra_modes = sizeof(intra_mode_list)/sizeof(PREDICTION_MODE);
+ int i;
+
+ // TODO(yunqingwang): Check intra mode mask and only update freq_fact
+ // for those valid modes.
+ for (i = 0; i < intra_modes; i++) {
+ PREDICTION_MODE this_mode = intra_mode_list[i];
+ update_thresh_freq_fact(cpi, tile_data, bsize, INTRA_FRAME,
+ best_mode_idx, this_mode);
+ }
+ } else {
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ if (best_ref_frame != ref_frame) continue;
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ update_thresh_freq_fact(cpi, tile_data, bsize, ref_frame,
+ best_mode_idx, this_mode);
+ }
}
}
- if (cpi->sf.reuse_inter_pred_sby)
- pd->dst = orig_dst;
}
+
+ *rd_cost = best_rdc;
+}
+
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const struct segmentation *const seg = &cm->seg;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
+ MV_REFERENCE_FRAME best_ref_frame = NONE;
+ unsigned char segment_id = mbmi->segment_id;
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ int64_t best_rd = INT64_MAX;
+ b_mode_info bsi[MAX_REF_FRAMES][4];
+ int ref_frame_skip_mask = 0;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+ ctx->pred_pixel_ready = 0;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ int_mv dummy_mv[2];
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+
+ if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+ int_mv *const candidates = mbmi->ref_mvs[ref_frame];
+ const struct scale_factors *const sf =
+ &cm->frame_refs[ref_frame - 1].sf;
+ vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+ sf, sf);
+ vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame,
+ candidates, mi_row, mi_col, NULL, NULL);
+
+ vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+ &dummy_mv[0], &dummy_mv[1]);
+ } else {
+ ref_frame_skip_mask |= (1 << ref_frame);
+ }
+ }
+
+ mbmi->sb_type = bsize;
+ mbmi->tx_size = TX_4X4;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE;
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+ : cm->interp_filter;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ int64_t this_rd = 0;
+ int plane;
+
+ if (ref_frame_skip_mask & (1 << ref_frame))
+ continue;
+
+ // TODO(jingning, agrange): Scaling reference frame not supported for
+ // sub8x8 blocks. Is this supported now?
+ if (ref_frame > INTRA_FRAME &&
+ vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+ continue;
+
+    // If the segment reference frame feature is enabled, skip any reference
+    // frame that is not allowed for the current segment.
+ if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+ continue;
+
+ mbmi->ref_frame[0] = ref_frame;
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Select prediction reference frames.
+ for (plane = 0; plane < MAX_MB_PLANE; plane++)
+ xd->plane[plane].pre[0] = yv12_mb[ref_frame][plane];
+
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ int_mv b_mv[MB_MODE_COUNT];
+ int64_t b_best_rd = INT64_MAX;
+ const int i = idy * 2 + idx;
+ PREDICTION_MODE this_mode;
+ RD_COST this_rdc;
+ unsigned int var_y, sse_y;
+
+ struct macroblock_plane *p = &x->plane[0];
+ struct macroblockd_plane *pd = &xd->plane[0];
+
+ const struct buf_2d orig_src = p->src;
+ const struct buf_2d orig_dst = pd->dst;
+ struct buf_2d orig_pre[2];
+ memcpy(orig_pre, xd->plane[0].pre, sizeof(orig_pre));
+
+ // set buffer pointers for sub8x8 motion search.
+ p->src.buf =
+ &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ pd->dst.buf =
+ &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+ pd->pre[0].buf =
+ &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8,
+ i, pd->pre[0].stride)];
+
+ b_mv[ZEROMV].as_int = 0;
+ b_mv[NEWMV].as_int = INVALID_MV;
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, tile_info, i, 0, mi_row, mi_col,
+ &b_mv[NEARESTMV],
+ &b_mv[NEARMV]);
+
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ int b_rate = 0;
+ xd->mi[0]->bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int;
+
+ if (this_mode == NEWMV) {
+ const int step_param = cpi->sf.mv.fullpel_search_step_param;
+ MV mvp_full;
+ MV tmp_mv;
+ int cost_list[5];
+ const int tmp_col_min = x->mv_col_min;
+ const int tmp_col_max = x->mv_col_max;
+ const int tmp_row_min = x->mv_row_min;
+ const int tmp_row_max = x->mv_row_max;
+ int dummy_dist;
+
+ if (i == 0) {
+ mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3;
+ mvp_full.col = b_mv[NEARESTMV].as_mv.col >> 3;
+ } else {
+ mvp_full.row = xd->mi[0]->bmi[0].as_mv[0].as_mv.row >> 3;
+ mvp_full.col = xd->mi[0]->bmi[0].as_mv[0].as_mv.col >> 3;
+ }
+
+ vp9_set_mv_search_range(x, &mbmi->ref_mvs[0]->as_mv);
+
+ vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, x->sadperbit4,
+ cond_cost_list(cpi, cost_list),
+ &mbmi->ref_mvs[ref_frame][0].as_mv, &tmp_mv,
+ INT_MAX, 0);
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+          // Calculate the bit cost of the motion vector.
+ mvp_full.row = tmp_mv.row * 8;
+ mvp_full.col = tmp_mv.col * 8;
+
+ b_rate += vp9_mv_bit_cost(&mvp_full,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ x->nmvjointcost, x->mvcost,
+ MV_COST_WEIGHT);
+
+ b_rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+ [INTER_OFFSET(NEWMV)];
+ if (RDCOST(x->rdmult, x->rddiv, b_rate, 0) > b_best_rd)
+ continue;
+
+ cpi->find_fractional_mv_step(x, &tmp_mv,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ cpi->common.allow_high_precision_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost,
+ &dummy_dist,
+ &x->pred_sse[ref_frame], NULL, 0, 0);
+
+ xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv;
+ } else {
+ b_rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+ [INTER_OFFSET(this_mode)];
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+ pd->dst.buf, pd->dst.stride,
+ &xd->mi[0]->bmi[i].as_mv[0].as_mv,
+ &xd->block_refs[0]->sf,
+ 4 * num_4x4_blocks_wide,
+ 4 * num_4x4_blocks_high, 0,
+ vp9_get_interp_kernel(mbmi->interp_filter),
+ MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i & 0x01),
+ mi_row * MI_SIZE + 4 * (i >> 1), xd->bd);
+ } else {
+#endif
+ vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+ pd->dst.buf, pd->dst.stride,
+ &xd->mi[0]->bmi[i].as_mv[0].as_mv,
+ &xd->block_refs[0]->sf,
+ 4 * num_4x4_blocks_wide,
+ 4 * num_4x4_blocks_high, 0,
+ vp9_get_interp_kernel(mbmi->interp_filter),
+ MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i & 0x01),
+ mi_row * MI_SIZE + 4 * (i >> 1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ }
+#endif
+
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+ &var_y, &sse_y);
+
+ this_rdc.rate += b_rate;
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ this_rdc.rate, this_rdc.dist);
+ if (this_rdc.rdcost < b_best_rd) {
+ b_best_rd = this_rdc.rdcost;
+ bsi[ref_frame][i].as_mode = this_mode;
+ bsi[ref_frame][i].as_mv[0].as_mv = xd->mi[0]->bmi[i].as_mv[0].as_mv;
+ }
+ } // mode search
+
+ // restore source and prediction buffer pointers.
+ p->src = orig_src;
+ pd->pre[0] = orig_pre[0];
+ pd->dst = orig_dst;
+ this_rd += b_best_rd;
+
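+        // Store the winning sub-block info; for 8x4 / 4x8 partitions it is
+        // replicated into the extra 4x4 position(s) the block spans.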
+ xd->mi[0]->bmi[i] = bsi[ref_frame][i];
+ if (num_4x4_blocks_wide > 1)
+ xd->mi[0]->bmi[i + 1] = xd->mi[0]->bmi[i];
+ if (num_4x4_blocks_high > 1)
+ xd->mi[0]->bmi[i + 2] = xd->mi[0]->bmi[i];
+ }
+ } // loop through sub8x8 blocks
+
+ if (this_rd < best_rd) {
+ best_rd = this_rd;
+ best_ref_frame = ref_frame;
+ }
+ } // reference frames
+
+ mbmi->tx_size = TX_4X4;
+ mbmi->ref_frame[0] = best_ref_frame;
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ const int block = idy * 2 + idx;
+ xd->mi[0]->bmi[block] = bsi[best_ref_frame][block];
+ if (num_4x4_blocks_wide > 1)
+ xd->mi[0]->bmi[block + 1] = bsi[best_ref_frame][block];
+ if (num_4x4_blocks_high > 1)
+ xd->mi[0]->bmi[block + 2] = bsi[best_ref_frame][block];
+ }
+ }
+ mbmi->mode = xd->mi[0]->bmi[3].as_mode;
+ ctx->mic = *(xd->mi[0]);
+ ctx->skip_txfm[0] = 0;
+ ctx->skip = 0;
+ // Dummy assignment for speed -5. No effect in speed -6.
+ rd_cost->rdcost = best_rd;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h
index 97aeca76a7d..11f44099c1f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h
@@ -17,14 +17,21 @@
extern "C" {
#endif
+void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- const struct TileInfo *const tile,
- int mi_row, int mi_col,
- int *returnrate,
- int64_t *returndistortion,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx);
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnrhvs.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnrhvs.c
new file mode 100644
index 00000000000..e10e0284c58
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnrhvs.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * This code was originally written by: Gregory Maxwell, at the Daala
+ * project.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/encoder/vp9_ssim.h"
+
+#if !defined(M_PI)
+# define M_PI (3.141592653589793238462643)
+#endif
+#include <string.h>
+
+void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) {
+ (void) xstride;
+ vp9_fdct8x8_c(x, y, ystride);
+}
+
+/* Normalized inverse quantization matrix for 8x8 DCT at the point of
+ * transparency. This is not the JPEG-based matrix from the paper;
+ * this one gives a slightly higher MOS agreement. */
+float csf_y[8][8] = {{1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411,
+ 1.00227514334, 0.678296995242, 0.466224900598, 0.3265091542}, {2.2901594831,
+ 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, 0.868920337363,
+ 0.61280991668, 0.436405793551}, {2.08509755623, 2.04793073064,
+ 1.34329019223, 1.09205635862, 0.875748795257, 0.670882927016,
+ 0.501731932449, 0.372504254596}, {1.48366094411, 1.68731108984,
+ 1.09205635862, 0.772819797575, 0.605636379554, 0.48309405692,
+ 0.380429446972, 0.295774038565}, {1.00227514334, 1.2305666963,
+ 0.875748795257, 0.605636379554, 0.448996256676, 0.352889268808,
+ 0.283006984131, 0.226951348204}, {0.678296995242, 0.868920337363,
+ 0.670882927016, 0.48309405692, 0.352889268808, 0.27032073436,
+ 0.215017739696, 0.17408067321}, {0.466224900598, 0.61280991668,
+ 0.501731932449, 0.380429446972, 0.283006984131, 0.215017739696,
+ 0.168869545842, 0.136153931001}, {0.3265091542, 0.436405793551,
+ 0.372504254596, 0.295774038565, 0.226951348204, 0.17408067321,
+ 0.136153931001, 0.109083846276}};
+float csf_cb420[8][8] = {
+ {1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
+ 0.898018824055, 0.74725392039, 0.615105596242}, {2.46074210438,
+ 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
+ 1.17428548929, 0.996404342439, 0.830890433625}, {1.18284184739,
+ 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
+ 0.960060382087, 0.849823426169, 0.731221236837}, {1.14982565193,
+ 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099,
+ 0.751437590932, 0.685398513368, 0.608694761374}, {1.05017074788,
+ 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187,
+ 0.605503172737, 0.55002013668, 0.495804539034}, {0.898018824055,
+ 1.17428548929, 0.960060382087, 0.751437590932, 0.605503172737,
+ 0.514674450957, 0.454353482512, 0.407050308965}, {0.74725392039,
+ 0.996404342439, 0.849823426169, 0.685398513368, 0.55002013668,
+ 0.454353482512, 0.389234902883, 0.342353999733}, {0.615105596242,
+ 0.830890433625, 0.731221236837, 0.608694761374, 0.495804539034,
+ 0.407050308965, 0.342353999733, 0.295530605237}};
+float csf_cr420[8][8] = {
+ {2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
+ 0.867069376285, 0.721500455585, 0.593906509971}, {2.62502345193,
+ 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
+ 1.13381474809, 0.962064122248, 0.802254508198}, {1.26180942886,
+ 1.17180569821, 0.944981930573, 0.990876405848, 0.995903384143,
+ 0.926972725286, 0.820534991409, 0.706020324706}, {1.11019789803,
+ 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
+ 0.725539939514, 0.661776842059, 0.587716619023}, {1.01397751469,
+ 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286,
+ 0.584635025748, 0.531064164893, 0.478717061273}, {0.867069376285,
+ 1.13381474809, 0.926972725286, 0.725539939514, 0.584635025748,
+ 0.496936637883, 0.438694579826, 0.393021669543}, {0.721500455585,
+ 0.962064122248, 0.820534991409, 0.661776842059, 0.531064164893,
+ 0.438694579826, 0.375820256136, 0.330555063063}, {0.593906509971,
+ 0.802254508198, 0.706020324706, 0.587716619023, 0.478717061273,
+ 0.393021669543, 0.330555063063, 0.285345396658}};
+
+static double convert_score_db(double _score, double _weight) {
+ return 10 * (log10(255 * 255) - log10(_weight * _score));
+}
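+/* With _weight == 1.0 this reduces to 10 * log10(255^2 / _score): the usual
+   PSNR mapping, with the HVS-masked error playing the role of MSE. */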
+
+static double calc_psnrhvs(const unsigned char *_src, int _systride,
+ const unsigned char *_dst, int _dystride,
+ double _par, int _w, int _h, int _step,
+ float _csf[8][8]) {
+ float ret;
+ int16_t dct_s[8 * 8], dct_d[8 * 8];
+ tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8];
+ float mask[8][8];
+ int pixels;
+ int x;
+ int y;
+ (void) _par;
+ ret = pixels = 0;
+ /*In the PSNR-HVS-M paper[1] the authors describe the construction of
+ their masking table as "we have used the quantization table for the
+ color component Y of JPEG [6] that has been also obtained on the
+ basis of CSF. Note that the values in quantization table JPEG have
+ been normalized and then squared." Their CSF matrix (from PSNR-HVS)
+   was also constructed from the JPEG matrices. I cannot find any obvious
+ scheme of normalizing to produce their table, but if I multiply their
+ CSF by 0.38857 and square the result I get their masking table.
+ I have no idea where this constant comes from, but deviating from it
+ too greatly hurts MOS agreement.
+
+ [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
+ Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
+ of DCT basis functions", CD-ROM Proceedings of the Third
+ International Workshop on Video Processing and Quality Metrics for Consumer
+ Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
+ for (x = 0; x < 8; x++)
+ for (y = 0; y < 8; y++)
+ mask[x][y] = (_csf[x][y] * 0.3885746225901003)
+ * (_csf[x][y] * 0.3885746225901003);
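+  /* Sanity check of the constant above: for csf_y[0][0] = 1.6193873005,
+     mask[0][0] = (1.6193873005 * 0.3885746...)^2 ~= 0.629^2 ~= 0.396. */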
+ for (y = 0; y < _h - 7; y += _step) {
+ for (x = 0; x < _w - 7; x += _step) {
+ int i;
+ int j;
+ float s_means[4];
+ float d_means[4];
+ float s_vars[4];
+ float d_vars[4];
+ float s_gmean = 0;
+ float d_gmean = 0;
+ float s_gvar = 0;
+ float d_gvar = 0;
+ float s_mask = 0;
+ float d_mask = 0;
+ for (i = 0; i < 4; i++)
+ s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+ dct_s[i * 8 + j] = _src[(y + i) * _systride + (j + x)];
+ dct_d[i * 8 + j] = _dst[(y + i) * _dystride + (j + x)];
+ s_gmean += dct_s[i * 8 + j];
+ d_gmean += dct_d[i * 8 + j];
+ s_means[sub] += dct_s[i * 8 + j];
+ d_means[sub] += dct_d[i * 8 + j];
+ }
+ }
+ s_gmean /= 64.f;
+ d_gmean /= 64.f;
+ for (i = 0; i < 4; i++)
+ s_means[i] /= 16.f;
+ for (i = 0; i < 4; i++)
+ d_means[i] /= 16.f;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+ s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
+ d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
+ s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub])
+ * (dct_s[i * 8 + j] - s_means[sub]);
+ d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub])
+ * (dct_d[i * 8 + j] - d_means[sub]);
+ }
+ }
+ s_gvar *= 1 / 63.f * 64;
+ d_gvar *= 1 / 63.f * 64;
+ for (i = 0; i < 4; i++)
+ s_vars[i] *= 1 / 15.f * 16;
+ for (i = 0; i < 4; i++)
+ d_vars[i] *= 1 / 15.f * 16;
+ if (s_gvar > 0)
+ s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
+ if (d_gvar > 0)
+ d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
+ od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
+ s_mask = sqrt(s_mask * s_gvar) / 32.f;
+ d_mask = sqrt(d_mask * d_gvar) / 32.f;
+ if (d_mask > s_mask)
+ s_mask = d_mask;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ float err;
+ err = fabs(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]);
+ if (i != 0 || j != 0)
+ err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
+ ret += (err * _csf[i][j]) * (err * _csf[i][j]);
+ pixels++;
+ }
+ }
+ }
+ }
+ ret /= pixels;
+ return ret;
+}
+double vp9_psnrhvs(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+ double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs) {
+ double psnrhvs;
+ double par = 1.0;
+ int step = 7;
+ vp9_clear_system_state();
+ *y_psnrhvs = calc_psnrhvs(source->y_buffer, source->y_stride, dest->y_buffer,
+ dest->y_stride, par, source->y_crop_width,
+ source->y_crop_height, step, csf_y);
+
+ *u_psnrhvs = calc_psnrhvs(source->u_buffer, source->uv_stride, dest->u_buffer,
+ dest->uv_stride, par, source->uv_crop_width,
+ source->uv_crop_height, step, csf_cb420);
+
+ *v_psnrhvs = calc_psnrhvs(source->v_buffer, source->uv_stride, dest->v_buffer,
+ dest->uv_stride, par, source->uv_crop_width,
+ source->uv_crop_height, step, csf_cr420);
+ psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
+
+ return convert_score_db(psnrhvs, 1.0);
+}
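+
+/* Minimal usage sketch (illustrative): `src` and `dst` are assumed to be two
+ * already-populated YV12_BUFFER_CONFIG frames of equal size. Note that the
+ * per-plane outputs are raw masked-error scores; only the return value is
+ * converted to dB:
+ *
+ *   double y, u, v;
+ *   const double db = vp9_psnrhvs(src, dst, &y, &u, &v);
+ *   printf("PSNR-HVS: %.2f dB\n", db);
+ */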
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c
index 2ba1f922bc7..3c07e2c2437 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c
@@ -19,7 +19,8 @@
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
-void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+ int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
@@ -29,6 +30,9 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int tmp, eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
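+  // The up-front clears above guarantee well-defined, all-zero output on
+  // the skip_block path.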
if (!skip_block) {
tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 16;
@@ -41,12 +45,16 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
}
#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+ int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
if (!skip_block) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
@@ -56,7 +64,7 @@ void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
const int64_t tmp =
(clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
quant) >> 16;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
if (tmp)
eob = 0;
@@ -69,15 +77,20 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
+ const int n_coeffs = 1024;
const int rc = 0;
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int tmp, eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
if (!skip_block) {
- tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+ INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
@@ -96,8 +109,12 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr) {
+ const int n_coeffs = 1024;
int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
if (!skip_block) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
@@ -105,9 +122,9 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp =
- (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
- quant) >> 15;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ (clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+ INT32_MIN, INT32_MAX) * quant) >> 15;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
if (tmp)
eob = 0;
@@ -122,18 +139,17 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
  // TODO(jingning) Decide whether these arguments are needed after the
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Quantization pass: All coefficients with index >= zero_flag are
@@ -168,7 +184,6 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
@@ -178,11 +193,10 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Quantization pass: All coefficients with index >= zero_flag are
@@ -197,7 +211,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
(clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
quant_ptr[rc != 0]) >> 16;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (tmp)
@@ -217,16 +231,15 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
for (i = 0; i < n_coeffs; i++) {
@@ -261,16 +274,15 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
for (i = 0; i < n_coeffs; i++) {
@@ -284,7 +296,7 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT32_MIN, INT32_MAX);
tmp = (tmp * quant_ptr[rc != 0]) >> 15;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
}
@@ -302,17 +314,15 @@ void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
- const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
- zbin_ptr[1] + zbin_oq_value };
- const int nzbins[2] = { zbins[0] * -1,
- zbins[1] * -1 };
+ const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+ const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
@@ -355,18 +365,16 @@ void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, int zbin_oq_value,
+ const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
- const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
- zbin_ptr[1] + zbin_oq_value };
- const int nzbins[2] = { zbins[0] * -1,
- zbins[1] * -1 };
+ const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+ const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
@@ -393,7 +401,7 @@ void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
INT32_MIN, INT32_MAX);
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >> 16; // quantization
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (tmp)
@@ -412,10 +420,10 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
- ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
+ const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
int idx = 0;
@@ -423,8 +431,8 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int i, eob = -1;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
@@ -471,19 +479,19 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
- ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
- const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+ const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
@@ -510,7 +518,7 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >> 15;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (tmp)
@@ -530,21 +538,21 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vp9_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block),
- 16, x->skip_block,
- p->zbin, p->round, p->quant, p->quant_shift,
- BLOCK_OFFSET(p->qcoeff, block),
- BLOCK_OFFSET(pd->dqcoeff, block),
- pd->dequant, p->zbin_extra, &p->eobs[block],
- scan, iscan);
+ 16, x->skip_block,
+ p->zbin, p->round, p->quant, p->quant_shift,
+ BLOCK_OFFSET(p->qcoeff, block),
+ BLOCK_OFFSET(pd->dqcoeff, block),
+ pd->dequant, &p->eobs[block],
+ scan, iscan);
return;
}
#endif
vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
- 16, x->skip_block,
- p->zbin, p->round, p->quant, p->quant_shift,
- BLOCK_OFFSET(p->qcoeff, block),
- BLOCK_OFFSET(pd->dqcoeff, block),
- pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
+ 16, x->skip_block,
+ p->zbin, p->round, p->quant, p->quant_shift,
+ BLOCK_OFFSET(p->qcoeff, block),
+ BLOCK_OFFSET(pd->dqcoeff, block),
+ pd->dequant, &p->eobs[block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -600,7 +608,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
- cm->y_dequant[q][i] = quant;
+ cpi->y_dequant[q][i] = quant;
// uv
quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth)
@@ -611,7 +619,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
- cm->uv_dequant[q][i] = quant;
+ cpi->uv_dequant[q][i] = quant;
}
for (i = 2; i < 8; i++) {
@@ -621,7 +629,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
quants->y_zbin[q][i] = quants->y_zbin[q][1];
quants->y_round[q][i] = quants->y_round[q][1];
- cm->y_dequant[q][i] = cm->y_dequant[q][1];
+ cpi->y_dequant[q][i] = cpi->y_dequant[q][1];
quants->uv_quant[q][i] = quants->uv_quant[q][1];
quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
@@ -629,7 +637,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
quants->uv_round[q][i] = quants->uv_round[q][1];
- cm->uv_dequant[q][i] = cm->uv_dequant[q][1];
+ cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1];
}
}
}
@@ -638,10 +646,9 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
const VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
QUANTS *const quants = &cpi->quants;
- const int segment_id = xd->mi[0].src_mi->mbmi.segment_id;
+ const int segment_id = xd->mi[0]->mbmi.segment_id;
const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
- const int zbin = cpi->zbin_mode_boost;
int i;
// Y
@@ -651,13 +658,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[0].quant_shift = quants->y_quant_shift[qindex];
x->plane[0].zbin = quants->y_zbin[qindex];
x->plane[0].round = quants->y_round[qindex];
- x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7);
- xd->plane[0].dequant = cm->y_dequant[qindex];
+ xd->plane[0].dequant = cpi->y_dequant[qindex];
- x->plane[0].quant_thred[0] = (x->plane[0].zbin[0] + x->plane[0].zbin_extra) *
- (x->plane[0].zbin[0] + x->plane[0].zbin_extra);
- x->plane[0].quant_thred[1] = (x->plane[0].zbin[1] + x->plane[0].zbin_extra) *
- (x->plane[0].zbin[1] + x->plane[0].zbin_extra);
+ x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
+ x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
// UV
for (i = 1; i < 3; i++) {
@@ -667,15 +671,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
x->plane[i].zbin = quants->uv_zbin[qindex];
x->plane[i].round = quants->uv_round[qindex];
- x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7);
- xd->plane[i].dequant = cm->uv_dequant[qindex];
-
- x->plane[i].quant_thred[0] =
- (x->plane[i].zbin[0] + x->plane[i].zbin_extra) *
- (x->plane[i].zbin[0] + x->plane[i].zbin_extra);
- x->plane[i].quant_thred[1] =
- (x->plane[i].zbin[1] + x->plane[i].zbin_extra) *
- (x->plane[i].zbin[1] + x->plane[i].zbin_extra);
+ xd->plane[i].dequant = cpi->uv_dequant[qindex];
+
+ x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
+ x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
}
x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
@@ -684,24 +683,11 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
x->errorperbit = rdmult >> 6;
x->errorperbit += (x->errorperbit == 0);
- vp9_initialize_me_consts(cpi, x->q_index);
-}
-
-void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
- const int qindex = x->q_index;
- const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
- cpi->zbin_mode_boost) >> 7;
- const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
- cpi->zbin_mode_boost) >> 7;
-
- x->plane[0].zbin_extra = (int16_t)y_zbin_extra;
- x->plane[1].zbin_extra = (int16_t)uv_zbin_extra;
- x->plane[2].zbin_extra = (int16_t)uv_zbin_extra;
+ vp9_initialize_me_consts(cpi, x, x->q_index);
}
void vp9_frame_init_quantizer(VP9_COMP *cpi) {
- cpi->zbin_mode_boost = 0;
- vp9_init_plane_quantizers(cpi, &cpi->mb);
+ vp9_init_plane_quantizers(cpi, &cpi->td.mb);
}
void vp9_set_quantizer(VP9_COMMON *cm, int q) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h
index cee46e7e033..55e546944a7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h
@@ -37,7 +37,8 @@ typedef struct {
DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
} QUANTS;
-void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+ int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
@@ -49,7 +50,8 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan);
#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+ int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
@@ -68,8 +70,6 @@ struct VP9Common;
void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
-void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
-
void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
void vp9_init_quantizer(struct VP9_COMP *cpi);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
index 65bca669a89..4c33ffd977b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -18,6 +18,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_quant_common.h"
@@ -185,9 +186,9 @@ int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
return (int)(enumerator * correction_factor / q);
}
-static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
- double correction_factor,
- vpx_bit_depth_t bit_depth) {
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+ double correction_factor,
+ vpx_bit_depth_t bit_depth) {
const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor,
bit_depth));
return MAX(FRAME_OVERHEAD_BITS,
@@ -196,6 +197,7 @@ static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
const RATE_CONTROL *rc = &cpi->rc;
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
const int min_frame_target = MAX(rc->min_frame_bandwidth,
rc->avg_frame_bandwidth >> 5);
if (target < min_frame_target)
@@ -210,6 +212,11 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
// Clip the frame target to the maximum allowed value.
if (target > rc->max_frame_bandwidth)
target = rc->max_frame_bandwidth;
+ if (oxcf->rc_max_inter_bitrate_pct) {
+ const int max_rate = rc->avg_frame_bandwidth *
+ oxcf->rc_max_inter_bitrate_pct / 100;
+ target = MIN(target, max_rate);
+ }
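+  // E.g. rc_max_inter_bitrate_pct = 300 caps an inter frame at 3x the
+  // average frame bandwidth.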
return target;
}
@@ -226,7 +233,6 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
return target;
}
-
// Update the buffer level for higher layers, given the encoded current layer.
static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
int temporal_layer = 0;
@@ -354,26 +360,34 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) {
static double get_rate_correction_factor(const VP9_COMP *cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
+ double rcf;
if (cpi->common.frame_type == KEY_FRAME) {
- return rc->rate_correction_factors[KF_STD];
+ rcf = rc->rate_correction_factors[KF_STD];
} else if (cpi->oxcf.pass == 2) {
RATE_FACTOR_LEVEL rf_lvl =
cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
- return rc->rate_correction_factors[rf_lvl];
+ rcf = rc->rate_correction_factors[rf_lvl];
} else {
if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
- !rc->is_src_frame_alt_ref &&
- !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR))
- return rc->rate_correction_factors[GF_ARF_STD];
+ !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+ (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+ rcf = rc->rate_correction_factors[GF_ARF_STD];
else
- return rc->rate_correction_factors[INTER_NORMAL];
+ rcf = rc->rate_correction_factors[INTER_NORMAL];
}
+ rcf *= rcf_mult[rc->frame_size_selector];
+ return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
}
static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
RATE_CONTROL *const rc = &cpi->rc;
+ // Normalize RCF to account for the size-dependent scaling factor.
+ factor /= rcf_mult[cpi->rc.frame_size_selector];
+
+ factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
if (cpi->common.frame_type == KEY_FRAME) {
rc->rate_correction_factors[KF_STD] = factor;
} else if (cpi->oxcf.pass == 2) {
@@ -382,15 +396,15 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
rc->rate_correction_factors[rf_lvl] = factor;
} else {
if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
- !rc->is_src_frame_alt_ref &&
- !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR))
+ !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+ (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
rc->rate_correction_factors[GF_ARF_STD] = factor;
else
rc->rate_correction_factors[INTER_NORMAL] = factor;
}
}
-void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
const VP9_COMMON *const cm = &cpi->common;
int correction_factor = 100;
double rate_correction_factor = get_rate_correction_factor(cpi);
@@ -408,36 +422,41 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
// Work out how big we would have expected the frame to be at this Q given
// the current correction factor.
// Stay in double to avoid int overflow when values are large
- projected_size_based_on_q = estimate_bits_at_q(cm->frame_type,
- cm->base_qindex, cm->MBs,
- rate_correction_factor,
- cm->bit_depth);
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+ projected_size_based_on_q =
+ vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+ } else {
+ projected_size_based_on_q = vp9_estimate_bits_at_q(cpi->common.frame_type,
+ cm->base_qindex,
+ cm->MBs,
+ rate_correction_factor,
+ cm->bit_depth);
+ }
// Work out a size correction factor.
if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
- correction_factor = (100 * cpi->rc.projected_frame_size) /
- projected_size_based_on_q;
+ correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+ projected_size_based_on_q);
  // A more heavily damped adjustment is used if we have been oscillating on
  // either side of the target.
- switch (damp_var) {
- case 0:
- adjustment_limit = 0.75;
- break;
- case 1:
- adjustment_limit = 0.375;
- break;
- case 2:
- default:
- adjustment_limit = 0.25;
- break;
- }
+ adjustment_limit = 0.25 +
+ 0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
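+  // E.g. correction_factor = 200 gives 0.25 + 0.5 * log10(2) ~= 0.40;
+  // factors near 100 damp toward 0.25 and anything beyond 10x in either
+  // direction saturates at 0.75, matching the bounds of the old switch.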
+
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ if (correction_factor > 110)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 90)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
if (correction_factor > 102) {
// We are not already at the worst allowable quality
correction_factor = (int)(100 + ((correction_factor - 100) *
adjustment_limit));
rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
-
// Keep rate_correction_factor within limits
if (rate_correction_factor > MAX_BPB_FACTOR)
rate_correction_factor = MAX_BPB_FACTOR;
@@ -461,7 +480,7 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
const VP9_COMMON *const cm = &cpi->common;
int q = active_worst_quality;
int last_error = INT_MAX;
- int i, target_bits_per_mb;
+ int i, target_bits_per_mb, bits_per_mb_at_this_q;
const double correction_factor = get_rate_correction_factor(cpi);
// Calculate required scaling factor based on target frame size and size of
@@ -472,9 +491,14 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
i = active_best_quality;
do {
- const int bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i,
- correction_factor,
- cm->bit_depth);
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ bits_per_mb_at_this_q =
+ (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
+ } else {
+ bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i,
+ correction_factor,
+ cm->bit_depth);
+ }
if (bits_per_mb_at_this_q <= target_bits_per_mb) {
if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
@@ -488,6 +512,14 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
}
} while (++i <= active_worst_quality);
+ // In CBR mode, this makes sure q is between oscillating Qs to prevent
+ // resonance.
+ if (cpi->oxcf.rc_mode == VPX_CBR &&
+ (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+ cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+ q = clamp(q, MIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+ MAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+ }
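+  // E.g. if the last frame overshot (rc_1_frame == -1) and the one before
+  // undershot (rc_2_frame == 1), q is pinned between those two frames' Qs.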
return q;
}
@@ -557,18 +589,23 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
const VP9_COMMON *const cm = &cpi->common;
const RATE_CONTROL *rc = &cpi->rc;
// Buffer level below which we push active_worst to worst_quality.
- int64_t critical_level = rc->optimal_buffer_level >> 2;
+ int64_t critical_level = rc->optimal_buffer_level >> 3;
int64_t buff_lvl_step = 0;
int adjustment = 0;
int active_worst_quality;
+ int ambient_qp;
if (cm->frame_type == KEY_FRAME)
- return rc->worst_quality * 4 / 5;
- if (cm->current_video_frame > 1)
- active_worst_quality = MIN(rc->worst_quality,
- rc->avg_frame_qindex[INTER_FRAME] * 5 / 4);
- else
- active_worst_quality = MIN(rc->worst_quality,
- rc->avg_frame_qindex[KEY_FRAME] * 3 / 2);
+ return rc->worst_quality;
+  // For ambient_qp we use the minimum of avg_frame_qindex[KEY_FRAME] and
+  // avg_frame_qindex[INTER_FRAME] for the first few frames following a key
+  // frame. Both are initialized to worst_quality and updated with a
+  // (3/4, 1/4) running average in postencode_update, so for the first few
+  // frames after a key frame, that key frame's QP is weighted into the
+  // active_worst_quality setting.
+ ambient_qp = (cm->current_video_frame < 5) ?
+ MIN(rc->avg_frame_qindex[INTER_FRAME], rc->avg_frame_qindex[KEY_FRAME]) :
+ rc->avg_frame_qindex[INTER_FRAME];
+ active_worst_quality = MIN(rc->worst_quality,
+ ambient_qp * 5 / 4);
if (rc->buffer_level > rc->optimal_buffer_level) {
// Adjust down.
// Maximum limit for down adjustment, ~30%.
@@ -586,12 +623,11 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
if (critical_level) {
buff_lvl_step = (rc->optimal_buffer_level - critical_level);
if (buff_lvl_step) {
- adjustment =
- (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
- (rc->optimal_buffer_level - rc->buffer_level) /
- buff_lvl_step);
+ adjustment = (int)((rc->worst_quality - ambient_qp) *
+ (rc->optimal_buffer_level - rc->buffer_level) /
+ buff_lvl_step);
}
- active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
+ active_worst_quality = ambient_qp + adjustment;
}
} else {
// Set to worst_quality if buffer is below critical level.
@@ -720,7 +756,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
static int get_active_cq_level(const RATE_CONTROL *rc,
const VP9EncoderConfig *const oxcf) {
- static const double cq_adjust_threshold = 0.5;
+ static const double cq_adjust_threshold = 0.1;
int active_cq_level = oxcf->cq_level;
if (oxcf->rc_mode == VPX_CQ &&
rc->total_target_bits > 0) {
@@ -883,6 +919,23 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
return q;
}
+int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) {
+ static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+ 1.00, // INTER_NORMAL
+ 1.00, // INTER_HIGH
+ 1.50, // GF_ARF_LOW
+ 1.75, // GF_ARF_STD
+ 2.00, // KF_STD
+ };
+ static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] =
+ {INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME};
+ const VP9_COMMON *const cm = &cpi->common;
+ int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level],
+ q, rate_factor_deltas[rf_level],
+ cm->bit_depth);
+ return qdelta;
+}
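+// For example, rf_level == KF_STD (rate factor 2.0) yields a qdelta that
+// budgets the frame roughly twice the bits-per-mb implied by q alone.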
+
#define STATIC_MOTION_THRESH 95
static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
int *bottom_index,
@@ -890,6 +943,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
const VP9_COMMON *const cm = &cpi->common;
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
const int cq_level = get_active_cq_level(rc, oxcf);
int active_best_quality;
int active_worst_quality = cpi->twopass.active_worst_quality;
@@ -971,7 +1025,12 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
if (!cpi->refresh_alt_ref_frame) {
active_best_quality = cq_level;
} else {
- active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+ // Modify best quality for second level arfs. For mode VPX_Q this
+ // becomes the baseline frame q.
+ if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+ active_best_quality = (active_best_quality + cq_level + 1) / 2;
}
} else {
active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
@@ -991,9 +1050,9 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
}
}
- // Extenstion to max or min Q if undershoot or overshoot is outside
+ // Extension to max or min Q if undershoot or overshoot is outside
// the permitted range.
- if ((cpi->oxcf.rc_mode == VPX_VBR) &&
+ if ((cpi->oxcf.rc_mode != VPX_Q) &&
(cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
if (frame_is_intra_only(cm) ||
(!rc->is_src_frame_alt_ref &&
@@ -1012,25 +1071,21 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
if (!((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi))) ||
!rc->this_key_frame_forced ||
(cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
- 1.00, // INTER_NORMAL
- 1.00, // INTER_HIGH
- 1.50, // GF_ARF_LOW
- 1.75, // GF_ARF_STD
- 2.00, // KF_STD
- };
- const double rate_factor =
- rate_factor_deltas[gf_group->rf_level[gf_group->index]];
- int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
- active_worst_quality, rate_factor,
- cm->bit_depth);
- active_worst_quality = active_worst_quality + qdelta;
- active_worst_quality = MAX(active_worst_quality, active_best_quality);
+ int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+ active_worst_quality);
+ active_worst_quality = MAX(active_worst_quality + qdelta,
+ active_best_quality);
}
#endif
- // Clip the active best and worst quality values to limits.
+ // Modify active_best_quality for downscaled normal frames.
+ if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) {
+ int qdelta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
+ active_best_quality, 2.0,
+ cm->bit_depth);
+ active_best_quality = MAX(active_best_quality + qdelta, rc->best_quality);
+ }
+
active_best_quality = clamp(active_best_quality,
rc->best_quality, rc->worst_quality);
active_worst_quality = clamp(active_worst_quality,
@@ -1117,6 +1172,12 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) {
rc->this_frame_target = target;
+ // Modify frame size target when down-scaling.
+ if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+ rc->frame_size_selector != UNSCALED)
+ rc->this_frame_target = (int)(rc->this_frame_target
+ * rate_thresh_mult[rc->frame_size_selector]);
+
// Target rate per SB64 (including partial SB64s).
rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) /
(cm->width * cm->height);
@@ -1169,13 +1230,15 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
RATE_CONTROL *const rc = &cpi->rc;
const int qindex = cm->base_qindex;
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ vp9_cyclic_refresh_postencode(cpi);
+ }
+
// Update rate control heuristics
rc->projected_frame_size = (int)(bytes_used << 3);
// Post encode loop adjustment of Q prediction.
- vp9_rc_update_rate_correction_factors(
- cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF ||
- oxcf->rc_mode == VPX_CBR) ? 2 : 0);
+ vp9_rc_update_rate_correction_factors(cpi);
// Keep a record of last Q and ambient average Q.
if (cm->frame_type == KEY_FRAME) {
@@ -1205,7 +1268,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
// better than that already stored.
// This is used to help set quality in forced key frames to reduce popping.
if ((qindex < rc->last_boosted_qindex) ||
- (((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
+ (cm->frame_type == KEY_FRAME) ||
+ (!rc->constrained_gf_group &&
+ (cpi->refresh_alt_ref_frame ||
(cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
rc->last_boosted_qindex = qindex;
}
@@ -1247,14 +1312,20 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
rc->frames_since_key++;
rc->frames_to_key--;
}
+
+ // Flag a pending resize for the next frame if the scale selector changed.
+ cpi->resize_pending =
+ rc->next_frame_size_selector != rc->frame_size_selector;
+ rc->frame_size_selector = rc->next_frame_size_selector;
}
void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
// Update buffer level with zero size, update frame counters, and return.
update_buffer_level(cpi, 0);
- cpi->common.last_frame_type = cpi->common.frame_type;
cpi->rc.frames_since_key++;
cpi->rc.frames_to_key--;
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
}
// Use this macro to turn on/off use of alt-refs in one-pass mode.
@@ -1307,8 +1378,12 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
// NOTE: frames_till_gf_update_due must be <= frames_to_key.
- if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ if (rc->frames_till_gf_update_due > rc->frames_to_key) {
rc->frames_till_gf_update_due = rc->frames_to_key;
+ rc->constrained_gf_group = 1;
+ } else {
+ rc->constrained_gf_group = 0;
+ }
cpi->refresh_golden_frame = 1;
rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
rc->gfu_boost = DEFAULT_GF_BOOST;
@@ -1327,7 +1402,18 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
- int target = rc->avg_frame_bandwidth;
+ int target;
+
+ if (oxcf->gf_cbr_boost_pct) {
+ const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+ target = cpi->refresh_golden_frame ?
+ (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100) :
+ (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ } else {
+ target = rc->avg_frame_bandwidth;
+ }
if (svc->number_temporal_layers > 1 &&
oxcf->rc_mode == VPX_CBR) {
// Note that for layers, avg_frame_bandwidth is the cumulative
@@ -1347,6 +1433,11 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
target += (target * pct_high) / 200;
}
+ if (oxcf->rc_max_inter_bitrate_pct) {
+ const int max_rate = rc->avg_frame_bandwidth *
+ oxcf->rc_max_inter_bitrate_pct / 100;
+ target = MIN(target, max_rate);
+ }
return MAX(min_frame_target, target);
}
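// A worked sketch of the gf_cbr_boost_pct split above, under assumed
// numbers: with af_ratio_pct = gf_cbr_boost_pct + 100, the golden frame
// receives af_ratio_pct/100 times the bits of a normal inter frame while the
// group total stays at avg_frame_bandwidth * baseline_gf_interval.
#include <assert.h>
#include <stdio.h>

int main(void) {
  const int avg_bw = 20000;      // avg_frame_bandwidth (assumed)
  const int interval = 8;        // baseline_gf_interval (assumed)
  const int af_ratio_pct = 200;  // gf_cbr_boost_pct = 100 -> 2x boost
  const int denom = interval * 100 + af_ratio_pct - 100;
  const int target_gf = (avg_bw * interval * af_ratio_pct) / denom;
  const int target_p = (avg_bw * interval * 100) / denom;

  // One boosted golden frame plus (interval - 1) normal frames spends the
  // same budget as interval average frames (up to integer truncation).
  printf("golden: %d, inter: %d, group: %d vs budget: %d\n",
         target_gf, target_p, target_gf + (interval - 1) * target_p,
         avg_bw * interval);
  assert(target_gf / target_p == af_ratio_pct / 100);
  return 0;
}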
@@ -1416,6 +1507,12 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
target = calc_pframe_target_size_one_pass_cbr(cpi);
}
}
+
+ // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+ // should be done here, before the frame qp is selected.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_update_parameters(cpi);
+
vp9_rc_set_frame_target(cpi, target);
rc->frames_till_gf_update_due = INT_MAX;
rc->baseline_gf_interval = INT_MAX;
@@ -1436,15 +1533,33 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
rc->frames_to_key = cpi->oxcf.key_freq;
rc->kf_boost = DEFAULT_KF_BOOST;
rc->source_alt_ref_active = 0;
- target = calc_iframe_target_size_one_pass_cbr(cpi);
} else {
cm->frame_type = INTER_FRAME;
- target = calc_pframe_target_size_one_pass_cbr(cpi);
}
+ if (rc->frames_till_gf_update_due == 0) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_set_golden_update(cpi);
+ else
+ rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ }
+
+ // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+ // should be done here, before the frame qp is selected.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_update_parameters(cpi);
+
+ if (cm->frame_type == KEY_FRAME)
+ target = calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = calc_pframe_target_size_one_pass_cbr(cpi);
+
vp9_rc_set_frame_target(cpi, target);
- // Don't use gf_update by default in CBR mode.
- rc->frames_till_gf_update_due = INT_MAX;
- rc->baseline_gf_interval = INT_MAX;
}
int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
@@ -1485,19 +1600,30 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
// Convert the q target to an index
for (i = rc->best_quality; i < rc->worst_quality; ++i) {
- target_index = i;
- if (vp9_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <= target_bits_per_mb)
+ if (vp9_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <=
+ target_bits_per_mb) {
+ target_index = i;
break;
+ }
}
-
return target_index - qindex;
}
-void vp9_rc_set_gf_max_interval(const VP9_COMP *const cpi,
- RATE_CONTROL *const rc) {
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 16
+void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
+ RATE_CONTROL *const rc) {
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- // Set Maximum gf/arf interval
- rc->max_gf_interval = 16;
+
+ // Set a minimum interval.
+ rc->min_gf_interval =
+ MIN(MAX_GF_INTERVAL, MAX(MIN_GF_INTERVAL, (int)(cpi->framerate * 0.125)));
+
+ // Set Maximum gf/arf interval.
+ rc->max_gf_interval =
+ MIN(MAX_GF_INTERVAL, (int)(cpi->framerate * 0.75));
+ // Round up to next even number if odd.
+ rc->max_gf_interval += (rc->max_gf_interval & 0x01);
// Extended interval for genuinely static scenes
rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
@@ -1509,6 +1635,9 @@ void vp9_rc_set_gf_max_interval(const VP9_COMP *const cpi,
if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = MIN(rc->min_gf_interval, rc->max_gf_interval);
}
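// A quick check of the interval rules above for a few frame rates, ignoring
// the static-scene extension (which only raises the cap). MIN_GF_INTERVAL
// and MAX_GF_INTERVAL mirror the values defined in the patch (4 and 16).
#include <stdio.h>

#define MINV(a, b) ((a) < (b) ? (a) : (b))
#define MAXV(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
  const double rates[] = {12.0, 30.0, 60.0};
  int i;
  for (i = 0; i < 3; ++i) {
    const double fr = rates[i];
    int min_gf = MINV(16, MAXV(4, (int)(fr * 0.125)));
    int max_gf = MINV(16, (int)(fr * 0.75));
    max_gf += (max_gf & 1);         // round odd intervals up to even
    min_gf = MINV(min_gf, max_gf);  // clamp min to max
    printf("%.0f fps -> min %d, max %d\n", fr, min_gf, max_gf);
  }
  return 0;  // 12 fps -> min 4, max 10; 30 -> 4, 16; 60 -> 7, 16
}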
void vp9_rc_update_framerate(VP9_COMP *cpi) {
@@ -1535,5 +1664,45 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P),
vbr_max_bits);
- vp9_rc_set_gf_max_interval(cpi, rc);
+ vp9_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR: adjust the frame target based on over/undershoot from previous frames.
+static void vbr_rate_correction(VP9_COMP *cpi,
+ int *this_frame_target,
+ int64_t vbr_bits_off_target) {
+ int max_delta;
+ double position_factor = 1.0;
+
+ // How far through the clip are we? This ratio, in the range 0.0 - 1.0,
+ // damps the per-frame rate correction.
+ if (cpi->twopass.total_stats.count) {
+ position_factor = sqrt((double)cpi->common.current_video_frame /
+ cpi->twopass.total_stats.count);
+ }
+ max_delta = (int)(position_factor *
+ ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+ // vbr_bits_off_target > 0 means we have extra bits to spend
+ if (vbr_bits_off_target > 0) {
+ *this_frame_target +=
+ (vbr_bits_off_target > max_delta) ? max_delta
+ : (int)vbr_bits_off_target;
+ } else {
+ *this_frame_target -=
+ (vbr_bits_off_target < -max_delta) ? max_delta
+ : (int)-vbr_bits_off_target;
+ }
+}
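// The damping above scales the correction by sqrt(progress through the
// clip), so early frames move the target gently and late frames may use the
// full +/-50% (VBR_PCT_ADJUSTMENT_LIMIT) swing. A sketch of max_delta at a
// few positions, assuming a 30000-bit frame target:
#include <math.h>
#include <stdio.h>

int main(void) {
  const int frame_target = 30000;
  const double progress[] = {0.0625, 0.25, 1.0};
  int i;
  for (i = 0; i < 3; ++i) {
    const double position_factor = sqrt(progress[i]);
    const int max_delta =
        (int)(position_factor * ((frame_target * 50) / 100));
    printf("progress %.4f -> max_delta %d bits\n", progress[i], max_delta);
  }
  return 0;  // 0.0625 -> 3750, 0.25 -> 7500, 1.00 -> 15000
}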
+
+void vp9_set_target_rate(VP9_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target_rate = rc->base_frame_target;
+
+ // Correction to rate target based on prior over or under shoot.
+ if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
+ vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
+ vp9_rc_set_frame_target(cpi, target_rate);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
index bc74129e591..869f6e59e97 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -33,6 +33,27 @@ typedef enum {
RATE_FACTOR_LEVELS = 5
} RATE_FACTOR_LEVEL;
+// Internal frame scaling level.
+typedef enum {
+ UNSCALED = 0, // Frame is unscaled.
+ SCALE_STEP1 = 1, // First-level down-scaling.
+ FRAME_SCALE_STEPS
+} FRAME_SCALE_LEVEL;
+
+// Frame dimension multipliers w.r.t. the native frame size, in 1/16th
+// units, specified for the scale-up case.
+// e.g. 24 => 16/24 = 2/3 of the native size. The restriction to 1/16th
+// units is intended to match the capabilities of the normative scaling
+// filters, giving precedence to up-scaling accuracy.
+static const int frame_scale_factor[FRAME_SCALE_STEPS] = {16, 24};
+
+// Multiplier of the target rate to be used as threshold for triggering scaling.
+static const double rate_thresh_mult[FRAME_SCALE_STEPS] = {1.0, 2.0};
+
+// Scale-dependent Rate Correction Factor multipliers, compensating for the
+// greater number of bits per pixel generated in down-scaled frames.
+static const double rcf_mult[FRAME_SCALE_STEPS] = {1.0, 2.0};
+
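// The three tables above cooperate: frame_scale_factor gives the dimension
// ratio in 1/16th units, rate_thresh_mult scales the target-rate threshold
// that triggers a scale switch, and rcf_mult compensates the rate correction
// factor for the denser bits/pixel of down-scaled frames. A dimension sketch
// (the real encoder also aligns sizes; this ignores alignment):
#include <stdio.h>

int main(void) {
  static const int frame_scale_factor[2] = {16, 24};  // UNSCALED, SCALE_STEP1
  const int native_w = 1280, native_h = 720;
  int level;
  for (level = 0; level < 2; ++level) {
    // Scale factor 24 means 16/24 = 2/3 of the native dimensions.
    const int w = native_w * 16 / frame_scale_factor[level];
    const int h = native_h * 16 / frame_scale_factor[level];
    printf("level %d: %dx%d\n", level, w, h);
  }
  return 0;  // level 0: 1280x720, level 1: 853x480
}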
typedef struct {
// Rate targetting variables
int base_frame_target; // A baseline frame target before adjustment
@@ -52,9 +73,11 @@ typedef struct {
int frames_since_golden;
int frames_till_gf_update_due;
+ int min_gf_interval;
int max_gf_interval;
int static_scene_max_gf_interval;
int baseline_gf_interval;
+ int constrained_gf_group;
int frames_to_key;
int frames_since_key;
int this_key_frame_forced;
@@ -99,7 +122,22 @@ typedef struct {
int64_t starting_buffer_level;
int64_t optimal_buffer_level;
int64_t maximum_buffer_size;
- // int active_best_quality;
+
+ // Rate control history for the last frame (1) and the frame before (2).
+ // -1: undershoot
+ //  1: overshoot
+ //  0: not initialized.
+ int rc_1_frame;
+ int rc_2_frame;
+ int q_1_frame;
+ int q_2_frame;
+
+ // Auto frame-scaling variables.
+ FRAME_SCALE_LEVEL frame_size_selector;
+ FRAME_SCALE_LEVEL next_frame_size_selector;
+ int frame_width[FRAME_SCALE_STEPS];
+ int frame_height[FRAME_SCALE_STEPS];
+ int rf_level_maxq[RATE_FACTOR_LEVELS];
} RATE_CONTROL;
struct VP9_COMP;
@@ -108,6 +146,10 @@ struct VP9EncoderConfig;
void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass,
RATE_CONTROL *rc);
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+ double correction_factor,
+ vpx_bit_depth_t bit_depth);
+
double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth);
void vp9_rc_init_minq_luts();
@@ -148,7 +190,7 @@ void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
// Updates rate correction factors
// Changes only the rate correction factors in the rate control structure.
-void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var);
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi);
// Decide if we should drop this frame: For 1-pass CBR.
// Changes only the decimation count in the rate control structure
@@ -193,10 +235,14 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
int qindex, double rate_target_ratio,
vpx_bit_depth_t bit_depth);
+int vp9_frame_type_qdelta(const struct VP9_COMP *cpi, int rf_level, int q);
+
void vp9_rc_update_framerate(struct VP9_COMP *cpi);
-void vp9_rc_set_gf_max_interval(const struct VP9_COMP *const cpi,
- RATE_CONTROL *const rc);
+void vp9_rc_set_gf_interval_range(const struct VP9_COMP *const cpi,
+ RATE_CONTROL *const rc);
+
+void vp9_set_target_rate(struct VP9_COMP *cpi);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c
index 7f526fc4234..194001c51a2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c
@@ -65,7 +65,7 @@ static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
};
static void fill_mode_costs(VP9_COMP *cpi) {
- const FRAME_CONTEXT *const fc = &cpi->common.fc;
+ const FRAME_CONTEXT *const fc = cpi->common.fc;
int i, j;
for (i = 0; i < INTRA_MODES; ++i)
@@ -204,27 +204,28 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
}
-void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
+void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
#if CONFIG_VP9_HIGHBITDEPTH
switch (cpi->common.bit_depth) {
case VPX_BITS_8:
- cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
- cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
break;
case VPX_BITS_10:
- cpi->mb.sadperbit16 = sad_per_bit16lut_10[qindex];
- cpi->mb.sadperbit4 = sad_per_bit4lut_10[qindex];
+ x->sadperbit16 = sad_per_bit16lut_10[qindex];
+ x->sadperbit4 = sad_per_bit4lut_10[qindex];
break;
case VPX_BITS_12:
- cpi->mb.sadperbit16 = sad_per_bit16lut_12[qindex];
- cpi->mb.sadperbit4 = sad_per_bit4lut_12[qindex];
+ x->sadperbit16 = sad_per_bit16lut_12[qindex];
+ x->sadperbit4 = sad_per_bit4lut_12[qindex];
break;
default:
assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
}
#else
- cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
- cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
+ (void)cpi;
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
#endif // CONFIG_VP9_HIGHBITDEPTH
}
@@ -262,7 +263,7 @@ static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
void vp9_initialize_rd_consts(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
RD_OPT *const rd = &cpi->rd;
int i;
@@ -279,9 +280,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
set_block_thresholds(cm, rd);
- if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) {
- fill_token_costs(x->token_costs, cm->fc.coef_probs);
+ if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
+ fill_token_costs(x->token_costs, cm->fc->coef_probs);
+ if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+ cm->frame_type == KEY_FRAME) {
for (i = 0; i < PARTITION_CONTEXTS; ++i)
vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i),
vp9_partition_tree);
@@ -295,11 +298,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
vp9_build_nmv_cost_table(x->nmvjointcost,
cm->allow_high_precision_mv ? x->nmvcost_hp
: x->nmvcost,
- &cm->fc.nmvc, cm->allow_high_precision_mv);
+ &cm->fc->nmvc, cm->allow_high_precision_mv);
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
- cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
+ cm->fc->inter_mode_probs[i], vp9_inter_mode_tree);
}
}
}
@@ -379,7 +382,7 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
*d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
}
-void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
unsigned int qstep, int *rate,
int64_t *dist) {
// This function models the rate and distortion for a Laplacian
@@ -395,10 +398,10 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
int d_q10, r_q10;
static const uint32_t MAX_XSQ_Q10 = 245727;
const uint64_t xsq_q10_64 =
- ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10);
model_rd_norm(xsq_q10, &r_q10, &d_q10);
- *rate = (n * r_q10 + 2) >> 2;
+ *rate = ((r_q10 << n_log2) + 2) >> 2;
*dist = (var * (int64_t)d_q10 + 512) >> 10;
}
}
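// The signature change above replaces the pixel count n with its log2, so
// the multiplications become shifts; the results are bit-exact whenever n is
// a power of two, which block pixel counts always are. A quick equivalence
// check with assumed sample values:
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const unsigned int qstep = 40, var = 9000, n_log2 = 8;  // 16x16 = 256 px
  const unsigned int n = 1u << n_log2;
  const int r_q10 = 700;

  const uint64_t old_xsq =
      ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
  const uint64_t new_xsq =
      (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
  const int old_rate = (n * r_q10 + 2) >> 2;
  const int new_rate = (((unsigned)r_q10 << n_log2) + 2) >> 2;

  assert(old_xsq == new_xsq && old_rate == new_rate);
  printf("xsq_q10 = %llu, rate = %d\n",
         (unsigned long long)old_xsq, old_rate);
  return 0;
}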
@@ -416,8 +419,8 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
int i;
switch (tx_size) {
case TX_4X4:
- vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
- vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
break;
case TX_8X8:
for (i = 0; i < num_4x4_w; i += 2)
@@ -447,41 +450,47 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *ref_y_buffer, int ref_y_stride,
int ref_frame, BLOCK_SIZE block_size) {
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
int i;
int zero_seen = 0;
int best_index = 0;
int best_sad = INT_MAX;
int this_sad = INT_MAX;
int max_mv = 0;
+ int near_same_nearest;
uint8_t *src_y_ptr = x->plane[0].src.buf;
uint8_t *ref_y_ptr;
const int num_mv_refs = MAX_MV_REF_CANDIDATES +
(cpi->sf.adaptive_motion_search &&
- block_size < cpi->sf.max_partition_size);
+ block_size < x->max_partition_size);
MV pred_mv[3];
pred_mv[0] = mbmi->ref_mvs[ref_frame][0].as_mv;
pred_mv[1] = mbmi->ref_mvs[ref_frame][1].as_mv;
pred_mv[2] = x->pred_mv[ref_frame];
+ assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
+ near_same_nearest =
+ mbmi->ref_mvs[ref_frame][0].as_int == mbmi->ref_mvs[ref_frame][1].as_int;
// Get the sad for each candidate reference mv.
for (i = 0; i < num_mv_refs; ++i) {
const MV *this_mv = &pred_mv[i];
+ int fp_row, fp_col;
- max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
- if (is_zero_mv(this_mv) && zero_seen)
+ if (i == 1 && near_same_nearest)
continue;
+ fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
+ fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+ max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
- zero_seen |= is_zero_mv(this_mv);
-
- ref_y_ptr =
- &ref_y_buffer[ref_y_stride * (this_mv->row >> 3) + (this_mv->col >> 3)];
+ if (fp_row == 0 && fp_col == 0 && zero_seen)
+ continue;
+ zero_seen |= (fp_row == 0 && fp_col == 0);
+ ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
// Find sad for current vector.
this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
ref_y_ptr, ref_y_stride);
-
// Note if it is the best so far.
if (this_sad < best_sad) {
best_sad = this_sad;
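// The fp_row/fp_col expressions above round a 1/8-pel motion component to
// the nearest full pel, ties away from zero: the arithmetic right shift of
// a negative value floors, and the +(v >= 0) term offsets that for
// non-negative inputs. A small value table:
#include <stdio.h>

static int round_to_fullpel(int v_eighth_pel) {
  return (v_eighth_pel + 3 + (v_eighth_pel >= 0)) >> 3;
}

int main(void) {
  const int v[] = {-12, -5, -4, -3, 0, 3, 4, 5, 12};
  int i;
  for (i = 0; i < 9; ++i)
    printf("%d/8 pel -> %d\n", v[i], round_to_fullpel(v[i]));
  return 0;
  // -12 -> -2 (ties away from zero), -5 -> -1, -4 -> -1, -3 -> 0,
  //  0 -> 0, 3 -> 0, 4 -> 1 (ties away), 5 -> 1, 12 -> 2
}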
@@ -516,17 +525,32 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd,
}
}
-const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
- int ref_frame) {
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+ int raster_block, int stride) {
+ const int bw = b_width_log2_lookup[plane_bsize];
+ const int y = 4 * (raster_block >> bw);
+ const int x = 4 * (raster_block & ((1 << bw) - 1));
+ return y * stride + x;
+}
+
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+ int raster_block, int16_t *base) {
+ const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ return base + vp9_raster_block_offset(plane_bsize, raster_block, stride);
+}
+
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
+ int ref_frame) {
const VP9_COMMON *const cm = &cpi->common;
- const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
- return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
+ const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ return
+ (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) ?
+ &cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL;
}
-int vp9_get_switchable_rate(const VP9_COMP *cpi) {
- const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const int ctx = vp9_get_pred_context_switchable_interp(xd);
return SWITCHABLE_INTERP_RATE_FACTOR *
cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
@@ -557,10 +581,6 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
rd->thresh_mult[THR_NEWA] += 1000;
rd->thresh_mult[THR_NEWG] += 1000;
- // Adjust threshold only in real time mode, which only uses last
- // reference frame.
- rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh;
-
rd->thresh_mult[THR_NEARMV] += 1000;
rd->thresh_mult[THR_NEARA] += 1000;
rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
@@ -591,24 +611,34 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
}
void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
- const SPEED_FEATURES *const sf = &cpi->sf;
+ static const int thresh_mult[2][MAX_REFS] =
+ {{2500, 2500, 2500, 4500, 4500, 2500},
+ {2000, 2000, 2000, 4000, 4000, 2000}};
RD_OPT *const rd = &cpi->rd;
- int i;
-
- for (i = 0; i < MAX_REFS; ++i)
- rd->thresh_mult_sub8x8[i] = cpi->oxcf.mode == BEST ? -500 : 0;
-
- rd->thresh_mult_sub8x8[THR_LAST] += 2500;
- rd->thresh_mult_sub8x8[THR_GOLD] += 2500;
- rd->thresh_mult_sub8x8[THR_ALTR] += 2500;
- rd->thresh_mult_sub8x8[THR_INTRA] += 2500;
- rd->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
- rd->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
+ const int idx = cpi->oxcf.mode == BEST;
+ memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
+}
- // Check for masked out split cases.
- for (i = 0; i < MAX_REFS; ++i)
- if (sf->disable_split_mask & (1 << i))
- rd->thresh_mult_sub8x8[i] = INT_MAX;
+void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+ int bsize, int best_mode_index) {
+ if (rd_thresh > 0) {
+ const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
+ int mode;
+ for (mode = 0; mode < top_mode; ++mode) {
+ const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
+ const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
+ BLOCK_SIZE bs;
+ for (bs = min_size; bs <= max_size; ++bs) {
+ int *const fact = &factor_buf[bs][mode];
+ if (mode == best_mode_index) {
+ *fact -= (*fact >> 4);
+ } else {
+ *fact = MIN(*fact + RD_THRESH_INC,
+ rd_thresh * RD_THRESH_MAX_FACT);
+ }
+ }
+ }
+ }
}
int vp9_get_intra_cost_penalty(int qindex, int qdelta,
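// A standalone sketch of the factor update in vp9_update_rd_thresh_fact
// above: the winning mode's factor decays multiplicatively (so it is pruned
// less often), while every other mode's factor creeps up by RD_THRESH_INC,
// pruning it more aggressively, until it saturates at
// rd_thresh * RD_THRESH_MAX_FACT.
#include <stdio.h>

#define RD_THRESH_MAX_FACT 64
#define RD_THRESH_INC 1
#define MINV(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  int fact_winner = 32, fact_loser = 32;
  const int rd_thresh = 1;  // cap becomes 64
  int i;
  for (i = 0; i < 20; ++i) {
    fact_winner -= fact_winner >> 4;  // ~ *15/16 per win
    fact_loser = MINV(fact_loser + RD_THRESH_INC,
                      rd_thresh * RD_THRESH_MAX_FACT);
  }
  printf("after 20 blocks: winner %d, loser %d\n", fact_winner, fact_loser);
  return 0;  // winner 15, loser 52
}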
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h
index 1aa52663a60..4d247342b0a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h
@@ -36,6 +36,9 @@ extern "C" {
#define MAX_MODES 30
#define MAX_REFS 6
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC 1
+
// This enumerator type needs to be kept aligned with the mode order in
// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
typedef enum {
@@ -98,20 +101,12 @@ typedef struct RD_OPT {
int thresh_mult_sub8x8[MAX_REFS];
int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
- int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
-
- int mode_map[BLOCK_SIZES][MAX_MODES];
- int64_t comp_pred_diff[REFERENCE_MODES];
int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
- int64_t tx_select_diff[TX_MODES];
// TODO(agrange): can this overflow?
int tx_select_threshes[MAX_REF_FRAMES][TX_MODES];
- int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
- int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
- int64_t mask_filter;
int RDMULT;
int RDDIV;
@@ -129,6 +124,7 @@ void vp9_rd_cost_reset(RD_COST *rd_cost);
void vp9_rd_cost_init(RD_COST *rd_cost);
struct TileInfo;
+struct TileDataEnc;
struct VP9_COMP;
struct macroblock;
@@ -136,16 +132,23 @@ int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex);
void vp9_initialize_rd_consts(struct VP9_COMP *cpi);
-void vp9_initialize_me_consts(struct VP9_COMP *cpi, int qindex);
+void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
unsigned int qstep, int *rate,
int64_t *dist);
-int vp9_get_switchable_rate(const struct VP9_COMP *cpi);
+int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
+ const MACROBLOCKD *const xd);
+
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+ int raster_block, int stride);
-const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
- int ref_frame);
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+ int raster_block, int16_t *base);
+
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
+ int ref_frame);
void vp9_init_me_luts();
@@ -158,6 +161,9 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
+void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh,
+ int bsize, int best_mode_index);
+
static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
int thresh_fact) {
return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
index eca8e588092..73825623748 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
@@ -37,9 +37,7 @@
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_variance.h"
-
-#define RD_THRESH_MAX_FACT 64
-#define RD_THRESH_INC 1
+#include "vp9/encoder/vp9_aq_variance.h"
#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
(1 << INTRA_FRAME))
@@ -51,6 +49,7 @@
#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
#define MIN_EARLY_TERM_INDEX 3
+#define NEW_MV_DISCOUNT_FACTOR 8
typedef struct {
PREDICTION_MODE mode;
@@ -78,6 +77,7 @@ struct rdcost_block_args {
const scan_order *so;
};
+#define LAST_NEW_MV_INDEX 6
static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
{NEARESTMV, {LAST_FRAME, NONE}},
{NEARESTMV, {ALTREF_FRAME, NONE}},
@@ -129,19 +129,6 @@ static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
{{INTRA_FRAME, NONE}},
};
-static int raster_block_offset(BLOCK_SIZE plane_bsize,
- int raster_block, int stride) {
- const int bw = b_width_log2_lookup[plane_bsize];
- const int y = 4 * (raster_block >> bw);
- const int x = 4 * (raster_block & ((1 << bw) - 1));
- return y * stride + x;
-}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
- int raster_block, int16_t *base) {
- const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
- return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int m, int n, int min_plane, int max_plane) {
int i;
@@ -177,7 +164,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
int i;
int64_t rate_sum = 0;
int64_t dist_sum = 0;
- const int ref = xd->mi[0].src_mi->mbmi.ref_frame[0];
+ const int ref = xd->mi[0]->mbmi.ref_frame[0];
unsigned int sse;
unsigned int var = 0;
unsigned int sum_sse = 0;
@@ -268,15 +255,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
} else {
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> (xd->bd - 5),
&rate, &dist);
} else {
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
}
#else
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
rate_sum += rate;
@@ -305,6 +292,18 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
return error;
}
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+ int block_size) {
+ int i;
+ int64_t error = 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ }
+
+ return error;
+}
#if CONFIG_VP9_HIGHBITDEPTH
int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
@@ -341,14 +340,14 @@ static const int16_t band_counts[TX_SIZES][8] = {
{ 1, 2, 3, 4, 11, 256 - 21, 0 },
{ 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
-static INLINE int cost_coeffs(MACROBLOCK *x,
- int plane, int block,
- ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
- TX_SIZE tx_size,
- const int16_t *scan, const int16_t *nb,
- int use_fast_coef_costing) {
+static int cost_coeffs(MACROBLOCK *x,
+ int plane, int block,
+ ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
+ TX_SIZE tx_size,
+ const int16_t *scan, const int16_t *nb,
+ int use_fast_coef_costing) {
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const struct macroblock_plane *p = &x->plane[plane];
const struct macroblockd_plane *pd = &xd->plane[plane];
const PLANE_TYPE type = pd->plane_type;
@@ -360,6 +359,12 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
uint8_t token_cache[32 * 32];
int pt = combine_entropy_contexts(*A, *L);
int c, cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+
// Check for consistency of tx_size with mode info
assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
: get_uv_tx_size(mbmi, pd) == tx_size);
@@ -373,23 +378,29 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
// dc token
int v = qcoeff[0];
- int prev_t = vp9_dct_value_tokens_ptr[v].token;
- cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
+ int16_t prev_t;
+ EXTRABIT e;
+ vp9_get_token_extra(v, &prev_t, &e);
+ cost = (*token_costs)[0][pt][prev_t] +
+ vp9_get_cost(prev_t, e, cat6_high_cost);
+
token_cache[0] = vp9_pt_energy_class[prev_t];
++token_costs;
// ac tokens
for (c = 1; c < eob; c++) {
const int rc = scan[c];
- int t;
+ int16_t t;
v = qcoeff[rc];
- t = vp9_dct_value_tokens_ptr[v].token;
+ vp9_get_token_extra(v, &t, &e);
if (use_fast_coef_costing) {
- cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
+ cost += (*token_costs)[!prev_t][!prev_t][t] +
+ vp9_get_cost(t, e, cat6_high_cost);
} else {
pt = get_coef_context(nb, token_cache, c);
- cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
+ cost += (*token_costs)[!prev_t][pt][t] +
+ vp9_get_cost(t, e, cat6_high_cost);
token_cache[rc] = vp9_pt_energy_class[t];
}
prev_t = t;
@@ -441,7 +452,7 @@ static void dist_block(int plane, int block, TX_SIZE tx_size,
#endif // CONFIG_VP9_HIGHBITDEPTH
args->sse = this_sse >> shift;
- if (x->skip_encode && !is_inter_block(&xd->mi[0].src_mi->mbmi)) {
+ if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
(1 << ss_txfrm_size)) >> (shift + 2);
@@ -471,14 +482,15 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
int64_t rd1, rd2, rd;
if (args->skip)
return;
if (!is_inter_block(mbmi)) {
- vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
+ struct encode_b_args arg = {x, NULL, &mbmi->skip};
+ vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
dist_block(plane, block, tx_size, args, xd->bd);
@@ -509,8 +521,9 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
args->dist = args->sse;
if (x->plane[plane].eobs[block]) {
- int64_t dc_correct = coeff[0] * coeff[0] -
- (coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0]);
+ const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
+ const int64_t resd_sse = coeff[0] - dqcoeff[0];
+ int64_t dc_correct = orig_sse - resd_sse * resd_sse;
#if CONFIG_VP9_HIGHBITDEPTH
dc_correct >>= ((xd->bd - 8) * 2);
#endif
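// dc_correct above estimates the distortion the coded DC coefficient repairs
// when the rest of the block is skipped: coeff[0]^2 is the error of dropping
// DC entirely, (coeff[0] - dqcoeff[0])^2 the error remaining after coding
// it. The subtraction from the skip-path SSE happens just past this hunk; a
// numeric sketch with assumed values:
#include <stdio.h>

int main(void) {
  const int coeff0 = 37, dqcoeff0 = 36;  // original and dequantized DC
  const long long orig_sse = (long long)coeff0 * coeff0;  // 1369
  const long long resd = coeff0 - dqcoeff0;               // 1
  const long long dc_correct = orig_sse - resd * resd;    // 1368
  const long long dist_skip_all = 2000;  // assumed SSE with all coeffs skipped
  printf("dist = %lld\n", dist_skip_all - dc_correct);  // 632
  return 0;
}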
@@ -575,7 +588,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
args.use_fast_coef_costing = use_fast_coef_casting;
if (plane == 0)
- xd->mi[0].src_mi->mbmi.tx_size = tx_size;
+ xd->mi[0]->mbmi.tx_size = tx_size;
vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
@@ -605,7 +618,7 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
VP9_COMMON *const cm = &cpi->common;
const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
@@ -625,7 +638,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
int r[TX_SIZES][2], s[TX_SIZES];
int64_t d[TX_SIZES], sse[TX_SIZES];
@@ -639,7 +652,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_rd = INT64_MAX;
TX_SIZE best_tx = max_tx_size;
- const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
+ const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
assert(skip_prob > 0);
s0 = vp9_cost_bit(skip_prob, 0);
s1 = vp9_cost_bit(skip_prob, 1);
@@ -712,10 +725,10 @@ static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
int64_t sse;
int64_t *ret_sse = psse ? psse : &sse;
- assert(bs == xd->mi[0].src_mi->mbmi.sb_type);
+ assert(bs == xd->mi[0]->mbmi.sb_type);
if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
- vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
+ memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
bs);
} else {
@@ -760,10 +773,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
struct macroblockd_plane *pd = &xd->plane[0];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
- const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
- src_stride)];
- uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
- dst_stride)];
+ const uint8_t *src_init = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
+ src_stride)];
+ uint8_t *dst_init = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
+ dst_stride)];
ENTROPY_CONTEXT ta[2], tempa[2];
ENTROPY_CONTEXT tl[2], templ[2];
@@ -777,9 +790,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
assert(ib < 4);
- vpx_memcpy(ta, a, sizeof(ta));
- vpx_memcpy(tl, l, sizeof(tl));
- xd->mi[0].src_mi->mbmi.tx_size = TX_4X4;
+ memcpy(ta, a, sizeof(ta));
+ memcpy(tl, l, sizeof(tl));
+ xd->mi[0]->mbmi.tx_size = TX_4X4;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -799,18 +812,19 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
continue;
}
- vpx_memcpy(tempa, ta, sizeof(ta));
- vpx_memcpy(templ, tl, sizeof(tl));
+ memcpy(tempa, ta, sizeof(ta));
+ memcpy(templ, tl, sizeof(tl));
for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
const int block = ib + idy * 2 + idx;
const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
- int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
- p->src_diff);
+ int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8,
+ block,
+ p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
- xd->mi[0].src_mi->bmi[block].as_mode = mode;
+ xd->mi[0]->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, block, 1,
TX_4X4, mode,
x->skip_encode ? src : dst,
@@ -859,12 +873,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
*bestdistortion = distortion;
best_rd = this_rd;
*best_mode = mode;
- vpx_memcpy(a, tempa, sizeof(tempa));
- vpx_memcpy(l, templ, sizeof(templ));
+ memcpy(a, tempa, sizeof(tempa));
+ memcpy(l, templ, sizeof(templ));
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
- vpx_memcpy(best_dst16 + idy * 8,
- CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
- num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+ memcpy(best_dst16 + idy * 8,
+ CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ num_4x4_blocks_wide * 4 * sizeof(uint16_t));
}
}
next_highbd:
@@ -874,9 +888,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
return best_rd;
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
- vpx_memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
- best_dst16 + idy * 8,
- num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+ memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ best_dst16 + idy * 8,
+ num_4x4_blocks_wide * 4 * sizeof(uint16_t));
}
return best_rd;
@@ -899,18 +913,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
continue;
}
- vpx_memcpy(tempa, ta, sizeof(ta));
- vpx_memcpy(templ, tl, sizeof(tl));
+ memcpy(tempa, ta, sizeof(ta));
+ memcpy(templ, tl, sizeof(tl));
for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
const int block = ib + idy * 2 + idx;
const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
- int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
- p->src_diff);
+ int16_t *const src_diff =
+ vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
- xd->mi[0].src_mi->bmi[block].as_mode = mode;
+ xd->mi[0]->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, block, 1,
TX_4X4, mode,
x->skip_encode ? src : dst,
@@ -957,11 +971,11 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
*bestdistortion = distortion;
best_rd = this_rd;
*best_mode = mode;
- vpx_memcpy(a, tempa, sizeof(tempa));
- vpx_memcpy(l, templ, sizeof(templ));
+ memcpy(a, tempa, sizeof(tempa));
+ memcpy(l, templ, sizeof(templ));
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
- vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
- num_4x4_blocks_wide * 4);
+ memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
+ num_4x4_blocks_wide * 4);
}
next:
{}
@@ -971,8 +985,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
return best_rd;
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
- vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
- num_4x4_blocks_wide * 4);
+ memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
+ num_4x4_blocks_wide * 4);
return best_rd;
}
@@ -983,10 +997,10 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
int64_t best_rd) {
int i, j;
const MACROBLOCKD *const xd = &mb->e_mbd;
- MODE_INFO *const mic = xd->mi[0].src_mi;
- const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
- const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
- const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type;
+ MODE_INFO *const mic = xd->mi[0];
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
@@ -997,8 +1011,8 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
ENTROPY_CONTEXT t_above[4], t_left[4];
const int *bmode_costs = cpi->mbmode_cost;
- vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
- vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
+ memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
+ memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
// Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
@@ -1054,14 +1068,14 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
PREDICTION_MODE mode;
PREDICTION_MODE mode_selected = DC_PRED;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mic = xd->mi[0].src_mi;
+ MODE_INFO *const mic = xd->mi[0];
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion, this_rd;
TX_SIZE best_tx = TX_4X4;
int i;
int *bmode_costs;
- const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
- const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
bmode_costs = cpi->y_mode_costs[A][L];
@@ -1070,10 +1084,20 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < TX_MODES; i++)
tx_cache[i] = INT64_MAX;
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
/* Y Search for intra prediction mode */
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
int64_t local_tx_cache[TX_MODES];
+
+ if (cpi->sf.use_nonrd_pick_mode) {
+ // These speed features are turned on in hybrid non-RD and RD mode
+ // for key frame coding in the context of real-time setting.
+ if (conditional_skipintra(mode, mode_selected))
+ continue;
+ if (*skippable)
+ break;
+ }
+
mic->mbmi.mode = mode;
super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
@@ -1119,7 +1143,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
int64_t *sse, BLOCK_SIZE bsize,
int64_t ref_best_rd) {
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
int plane;
int pnrate = 0, pnskip = 1;
@@ -1177,12 +1201,12 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
int this_rate_tokenonly, this_rate, s;
int64_t this_distortion, this_sse;
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
continue;
- xd->mi[0].src_mi->mbmi.uv_mode = mode;
+ xd->mi[0]->mbmi.uv_mode = mode;
if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
&this_distortion, &s, &this_sse, bsize, best_rd))
@@ -1203,7 +1227,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- xd->mi[0].src_mi->mbmi.uv_mode = mode_selected;
+ xd->mi[0]->mbmi.uv_mode = mode_selected;
return best_rd;
}
@@ -1214,21 +1238,20 @@ static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
const VP9_COMMON *cm = &cpi->common;
int64_t unused;
- x->e_mbd.mi[0].src_mi->mbmi.uv_mode = DC_PRED;
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
super_block_uvrd(cpi, x, rate_tokenonly, distortion,
skippable, &unused, bsize, INT64_MAX);
*rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
}
-static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+static void choose_intra_uv_mode(VP9_COMP *cpi, MACROBLOCK *const x,
+ PICK_MODE_CONTEXT *ctx,
BLOCK_SIZE bsize, TX_SIZE max_tx_size,
int *rate_uv, int *rate_uv_tokenonly,
int64_t *dist_uv, int *skip_uv,
PREDICTION_MODE *mode_uv) {
- MACROBLOCK *const x = &cpi->mb;
-
// Use an estimated rd for uv_intra based on DC_PRED if the
// appropriate speed flag is set.
if (cpi->sf.use_uv_intra_rd_estimate) {
@@ -1241,7 +1264,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
}
- *mode_uv = x->e_mbd.mi[0].src_mi->mbmi.uv_mode;
+ *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
}
static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
@@ -1250,20 +1273,13 @@ static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
}
-static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize,
- int_mv *frame_mv,
- int mi_row, int mi_col,
- int_mv single_newmv[MAX_REF_FRAMES],
- int *rate_mv);
-
static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
PREDICTION_MODE mode, int_mv this_mv[2],
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
int_mv seg_mvs[MAX_REF_FRAMES],
int_mv *best_ref_mv[2], const int *mvjcost,
int *mvcost[2]) {
- MODE_INFO *const mic = xd->mi[0].src_mi;
+ MODE_INFO *const mic = xd->mi[0];
const MB_MODE_INFO *const mbmi = &mic->mbmi;
int thismvcost = 0;
int idx, idy;
@@ -1305,8 +1321,7 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
for (idy = 0; idy < num_4x4_blocks_high; ++idy)
for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
- vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
- &mic->bmi[i], sizeof(mic->bmi[i]));
+ memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i]));
return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
thismvcost;
@@ -1325,16 +1340,16 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
MACROBLOCKD *xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[0];
struct macroblock_plane *const p = &x->plane[0];
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ MODE_INFO *const mi = xd->mi[0];
const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
int idx, idy;
- const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
- p->src.stride)];
- uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
- pd->dst.stride)];
+ const uint8_t *const src =
+ &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ uint8_t *const dst = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ pd->dst.stride)];
int64_t thisdistortion = 0, thissse = 0;
int thisrate = 0, ref;
const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -1342,7 +1357,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
for (ref = 0; ref < 1 + is_compound; ++ref) {
- const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
+ const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i,
pd->pre[ref].stride)];
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -1376,17 +1391,17 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vp9_highbd_subtract_block(
- height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
- src, p->src.stride, dst, pd->dst.stride, xd->bd);
+ height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
} else {
vp9_subtract_block(
- height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
- src, p->src.stride, dst, pd->dst.stride);
+ height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
}
#else
vp9_subtract_block(height, width,
- raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
- src, p->src.stride, dst, pd->dst.stride);
+ vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
#endif // CONFIG_VP9_HIGHBITDEPTH
k = i;
@@ -1397,7 +1412,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
k += (idy * 2 + idx);
coeff = BLOCK_OFFSET(p->coeff, k);
- x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+ x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
#if CONFIG_VP9_HIGHBITDEPTH
@@ -1466,22 +1481,23 @@ static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
}
static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
- MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
- p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ p->src.buf = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ p->src.stride)];
assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
- pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
- pd->pre[0].stride)];
+ pd->pre[0].buf = &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ pd->pre[0].stride)];
if (has_second_ref(mbmi))
- pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
- pd->pre[1].stride)];
+ pd->pre[1].buf = &pd->pre[1].buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ pd->pre[1].stride)];
}
static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
struct buf_2d orig_pre[2]) {
- MB_MODE_INFO *mbmi = &x->e_mbd.mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
x->plane[0].src = orig_src;
x->e_mbd.plane[0].pre[0] = orig_pre[0];
if (has_second_ref(mbmi))
@@ -1529,6 +1545,190 @@ static int check_best_zero_mv(
return 1;
}
+static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ int_mv *frame_mv,
+ int mi_row, int mi_col,
+ int_mv single_newmv[MAX_REF_FRAMES],
+ int *rate_mv) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int refs[2] = {mbmi->ref_frame[0],
+ mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
+ int_mv ref_mv[2];
+ int ite, ref;
+ const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
+ struct scale_factors sf;
+
+ // Do joint motion search in compound mode to get more accurate mv.
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ int last_besterr[2] = {INT_MAX, INT_MAX};
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
+ vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+ };
+
+ // Prediction buffer from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+ uint8_t *second_pred;
+#else
+ DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ for (ref = 0; ref < 2; ++ref) {
+ ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
+
+ if (scaled_ref_frame[ref]) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+ NULL);
+ }
+
+ frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
+ }
+
+ // Since we have scaled the reference frames to match the size of the current
+ // frame, we must use a unit scaling factor during mode selection.
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+ cm->width, cm->height,
+ cm->use_highbitdepth);
+#else
+ vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+ cm->width, cm->height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Iterate the joint search over the two reference frames, and break out
+ // of the search loop once an iteration fails to find a better mv.
+ for (ite = 0; ite < 4; ite++) {
+ struct buf_2d ref_yv12[2];
+ int bestsme = INT_MAX;
+ int sadpb = x->sadperbit16;
+ MV tmp_mv;
+ int search_range = 3;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+ int id = ite % 2; // Even iterations search in the first reference frame,
+ // odd iterations search in the second. The predictor
+ // found for the 'other' reference frame is factored in.
+
+ // Initialized here because of a compiler problem in Visual Studio.
+ ref_yv12[0] = xd->plane[0].pre[0];
+ ref_yv12[1] = xd->plane[0].pre[1];
+
+ // Get the prediction block from the 'other' reference frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+ vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
+ ref_yv12[!id].stride,
+ second_pred, pw,
+ &frame_mv[refs[!id]].as_mv,
+ &sf, pw, ph, 0,
+ kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE,
+ xd->bd);
+ } else {
+ second_pred = (uint8_t *)second_pred_alloc_16;
+ vp9_build_inter_predictor(ref_yv12[!id].buf,
+ ref_yv12[!id].stride,
+ second_pred, pw,
+ &frame_mv[refs[!id]].as_mv,
+ &sf, pw, ph, 0,
+ kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE);
+ }
+#else
+ vp9_build_inter_predictor(ref_yv12[!id].buf,
+ ref_yv12[!id].stride,
+ second_pred, pw,
+ &frame_mv[refs[!id]].as_mv,
+ &sf, pw, ph, 0,
+ kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Do compound motion search on the current reference frame.
+ if (id)
+ xd->plane[0].pre[0] = ref_yv12[id];
+ vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
+
+ // Use the mv result from the single mode as mv predictor.
+ tmp_mv = frame_mv[refs[id]].as_mv;
+
+ tmp_mv.col >>= 3;
+ tmp_mv.row >>= 3;
+
+ // Small-range full-pixel motion search.
+ bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+ search_range,
+ &cpi->fn_ptr[bsize],
+ &ref_mv[id].as_mv, second_pred);
+ if (bestsme < INT_MAX)
+ bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
+ second_pred, &cpi->fn_ptr[bsize], 1);
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ bestsme = cpi->find_fractional_mv_step(
+ x, &tmp_mv,
+ &ref_mv[id].as_mv,
+ cpi->common.allow_high_precision_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[bsize],
+ 0, cpi->sf.mv.subpel_iters_per_step,
+ NULL,
+ x->nmvjointcost, x->mvcost,
+ &dis, &sse, second_pred,
+ pw, ph);
+ }
+
+ // Restore the pointer to the first (possibly scaled) prediction buffer.
+ if (id)
+ xd->plane[0].pre[0] = ref_yv12[0];
+
+ if (bestsme < last_besterr[id]) {
+ frame_mv[refs[id]].as_mv = tmp_mv;
+ last_besterr[id] = bestsme;
+ } else {
+ break;
+ }
+ }
+
+ *rate_mv = 0;
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // Restore the prediction frame pointers to their unscaled versions.
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+
+ *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &mbmi->ref_mvs[refs[ref]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+}
+
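// The loop above alternates which reference is refined: even iterations hold
// the ref-1 predictor fixed while searching ref 0, odd iterations the
// reverse, stopping once an iteration fails to beat that reference's best
// error. A control-flow sketch with a stubbed search (search_one_ref is
// hypothetical, standing in for the refining + subpel steps):
#include <limits.h>
#include <stdio.h>

static int search_one_ref(int id, int ite) {
  // Stub: pretend errors improve for a few rounds, then stall.
  static const int err[4] = {100, 90, 85, 90};
  (void)id;
  return err[ite];
}

int main(void) {
  int last_besterr[2] = {INT_MAX, INT_MAX};
  int ite;
  for (ite = 0; ite < 4; ite++) {
    const int id = ite % 2;  // even: first reference, odd: second
    const int bestsme = search_one_ref(id, ite);
    if (bestsme < last_besterr[id]) {
      last_besterr[id] = bestsme;  // keep the refined mv for this ref
    } else {
      break;  // no further gain: stop alternating
    }
  }
  printf("stopped after iteration %d\n", ite);
  return 0;  // err sequence 100,90,85,90 -> stops at ite == 3
}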
static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
const TileInfo * const tile,
int_mv *best_ref_mv,
@@ -1544,7 +1744,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
int i;
BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
MACROBLOCKD *xd = &x->e_mbd;
- MODE_INFO *mi = xd->mi[0].src_mi;
+ MODE_INFO *mi = xd->mi[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
int mode_idx;
int k, br = 0, idx, idy;
@@ -1576,8 +1776,8 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < 4; i++)
bsi->modes[i] = ZEROMV;
- vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
- vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
+ memcpy(t_above, pd->above_context, sizeof(t_above));
+ memcpy(t_left, pd->left_context, sizeof(t_left));
// 64 makes this threshold really big effectively
// making it so that we very rarely check mvs on
@@ -1619,11 +1819,11 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
this_mode, mbmi->ref_frame))
continue;
- vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
- vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
- sizeof(bsi->rdstat[i][mode_idx].ta));
- vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
- sizeof(bsi->rdstat[i][mode_idx].tl));
+ memcpy(orig_pre, pd->pre, sizeof(orig_pre));
+ memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
+ sizeof(bsi->rdstat[i][mode_idx].ta));
+ memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
+ sizeof(bsi->rdstat[i][mode_idx].tl));
// motion search for newmv (single predictor case only)
if (!has_second_rf && this_mode == NEWMV &&
@@ -1799,8 +1999,8 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (!subpelmv && have_ref &&
ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
- vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
- sizeof(SEG_RDSTAT));
+ memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
+ sizeof(SEG_RDSTAT));
if (num_4x4_blocks_wide > 1)
bsi->rdstat[i + 1][mode_idx].eobs =
ref_bsi->rdstat[i + 1][mode_idx].eobs;
@@ -1848,12 +2048,12 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (midx = 0; midx < INTER_MODES; ++midx)
bsi->rdstat[iy][midx].brdcost = INT64_MAX;
bsi->segment_rd = INT64_MAX;
- return INT64_MAX;;
+ return INT64_MAX;
}
mode_idx = INTER_OFFSET(mode_selected);
- vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
- vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
+ memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
+ memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
set_and_cost_bmi_mvs(cpi, xd, i, mode_selected, mode_mv[mode_selected],
frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
@@ -1871,7 +2071,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (midx = 0; midx < INTER_MODES; ++midx)
bsi->rdstat[iy][midx].brdcost = INT64_MAX;
bsi->segment_rd = INT64_MAX;
- return INT64_MAX;;
+ return INT64_MAX;
}
}
} /* for each label */
@@ -1920,8 +2120,8 @@ static void estimate_ref_frame_costs(const VP9_COMMON *cm,
int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
SEG_LVL_REF_FRAME);
if (seg_ref_active) {
- vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
- vpx_memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
+ memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
+ memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
*comp_mode_p = 128;
} else {
vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
@@ -1985,14 +2185,14 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
ctx->skip = x->skip;
ctx->skippable = skippable;
ctx->best_mode_index = mode_index;
- ctx->mic = *xd->mi[0].src_mi;
+ ctx->mic = *xd->mi[0];
ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
- vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
- vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
- sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
+ memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
+ memcpy(ctx->best_filter_diff, best_filter_diff,
+ sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
}
static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2006,16 +2206,19 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
const VP9_COMMON *cm = &cpi->common;
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ MODE_INFO *const mi = xd->mi[0];
int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+ assert(yv12 != NULL);
+
// TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
// use the UV scaling factors.
vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
// Gets an initial list of candidate vectors from neighbours and orders them
- vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
+ vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col,
+ NULL, NULL);
// Candidate refinement carried out at encoder and decoder
vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
@@ -2036,7 +2239,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *tmp_mv, int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
const VP9_COMMON *cm = &cpi->common;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
int bestsme = INT_MAX;
int step_param;
@@ -2093,24 +2296,27 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.adaptive_motion_search) {
int bwl = b_width_log2_lookup[bsize];
int bhl = b_height_log2_lookup[bsize];
- int i;
int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
if (tlevel < 5)
step_param += 2;
- for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
- if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
- x->pred_mv[ref].row = 0;
- x->pred_mv[ref].col = 0;
- tmp_mv->as_int = INVALID_MV;
-
- if (scaled_ref_frame) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++)
- xd->plane[i].pre[0] = backup_yv12[i];
+ // pred_mv_sad is not set up for dynamically scaled frames.
+ if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+ int i;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+ if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+ x->pred_mv[ref].row = 0;
+ x->pred_mv[ref].col = 0;
+ tmp_mv->as_int = INVALID_MV;
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ return;
}
- return;
}
}
}
@@ -2154,189 +2360,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
}
}
-static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize,
- int_mv *frame_mv,
- int mi_row, int mi_col,
- int_mv single_newmv[MAX_REF_FRAMES],
- int *rate_mv) {
- const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
- const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
- const int refs[2] = { mbmi->ref_frame[0],
- mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
- int_mv ref_mv[2];
- int ite, ref;
- // Prediction buffer from second frame.
-#if CONFIG_VP9_HIGHBITDEPTH
- uint8_t *second_pred;
- uint8_t *second_pred_alloc;
-#else
- uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
-#endif // CONFIG_VP9_HIGHBITDEPTH
- const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
-
- // Do joint motion search in compound mode to get more accurate mv.
- struct buf_2d backup_yv12[2][MAX_MB_PLANE];
- struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
- int last_besterr[2] = {INT_MAX, INT_MAX};
- const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
- vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
- vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
- };
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint16_t));
- second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc);
- } else {
- second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint8_t));
- second_pred = second_pred_alloc;
- }
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
- for (ref = 0; ref < 2; ++ref) {
- ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
-
- if (scaled_ref_frame[ref]) {
- int i;
- // Swap out the reference frame for a version that's been scaled to
- // match the resolution of the current frame, allowing the existing
- // motion search code to be used without additional modifications.
- for (i = 0; i < MAX_MB_PLANE; i++)
- backup_yv12[ref][i] = xd->plane[i].pre[ref];
- vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
- NULL);
- }
-
- frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
- }
-
- // Allow joint search multiple times iteratively for each ref frame
- // and break out the search loop if it couldn't find better mv.
- for (ite = 0; ite < 4; ite++) {
- struct buf_2d ref_yv12[2];
- int bestsme = INT_MAX;
- int sadpb = x->sadperbit16;
- MV tmp_mv;
- int search_range = 3;
-
- int tmp_col_min = x->mv_col_min;
- int tmp_col_max = x->mv_col_max;
- int tmp_row_min = x->mv_row_min;
- int tmp_row_max = x->mv_row_max;
- int id = ite % 2;
- // Initialized here because of compiler problem in Visual Studio.
- ref_yv12[0] = xd->plane[0].pre[0];
- ref_yv12[1] = xd->plane[0].pre[1];
-
- // Get pred block from second frame.
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
- ref_yv12[!id].stride,
- second_pred, pw,
- &frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
- kernel, MV_PRECISION_Q3,
- mi_col * MI_SIZE, mi_row * MI_SIZE,
- xd->bd);
- } else {
- vp9_build_inter_predictor(ref_yv12[!id].buf,
- ref_yv12[!id].stride,
- second_pred, pw,
- &frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
- kernel, MV_PRECISION_Q3,
- mi_col * MI_SIZE, mi_row * MI_SIZE);
- }
-#else
- vp9_build_inter_predictor(ref_yv12[!id].buf,
- ref_yv12[!id].stride,
- second_pred, pw,
- &frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
- kernel, MV_PRECISION_Q3,
- mi_col * MI_SIZE, mi_row * MI_SIZE);
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
- // Compound motion search on first ref frame.
- if (id)
- xd->plane[0].pre[0] = ref_yv12[id];
- vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
-
- // Use mv result from single mode as mvp.
- tmp_mv = frame_mv[refs[id]].as_mv;
-
- tmp_mv.col >>= 3;
- tmp_mv.row >>= 3;
-
- // Small-range full-pixel motion search
- bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
- search_range,
- &cpi->fn_ptr[bsize],
- &ref_mv[id].as_mv, second_pred);
- if (bestsme < INT_MAX)
- bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
- second_pred, &cpi->fn_ptr[bsize], 1);
-
- x->mv_col_min = tmp_col_min;
- x->mv_col_max = tmp_col_max;
- x->mv_row_min = tmp_row_min;
- x->mv_row_max = tmp_row_max;
-
- if (bestsme < INT_MAX) {
- int dis; /* TODO: use dis in distortion calculation later. */
- unsigned int sse;
- bestsme = cpi->find_fractional_mv_step(
- x, &tmp_mv,
- &ref_mv[id].as_mv,
- cpi->common.allow_high_precision_mv,
- x->errorperbit,
- &cpi->fn_ptr[bsize],
- 0, cpi->sf.mv.subpel_iters_per_step,
- NULL,
- x->nmvjointcost, x->mvcost,
- &dis, &sse, second_pred,
- pw, ph);
- }
-
- if (id)
- xd->plane[0].pre[0] = scaled_first_yv12;
-
- if (bestsme < last_besterr[id]) {
- frame_mv[refs[id]].as_mv = tmp_mv;
- last_besterr[id] = bestsme;
- } else {
- break;
- }
- }
-
- *rate_mv = 0;
-
- for (ref = 0; ref < 2; ++ref) {
- if (scaled_ref_frame[ref]) {
- // restore the predictor
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++)
- xd->plane[i].pre[ref] = backup_yv12[ref][i];
- }
-
- *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
- &mbmi->ref_mvs[refs[ref]][0].as_mv,
- x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
- }
-
-#if CONFIG_VP9_HIGHBITDEPTH
- vpx_free(second_pred_alloc);
-#else
- vpx_free(second_pred);
-#endif // CONFIG_VP9_HIGHBITDEPTH
-}
static INLINE void restore_dst_buf(MACROBLOCKD *xd,
uint8_t *orig_dst[MAX_MB_PLANE],
@@ -2348,6 +2372,27 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd,
}
}
+// In some situations we want to discount the apparent cost of a new motion
+// vector. Where there is a subtle motion field and especially where there is
+// low spatial complexity then it can be hard to cover the cost of a new motion
+// vector in a single block, even if that motion vector reduces distortion.
+// However, once established that vector may be usable through the nearest and
+// near mv modes to reduce distortion in subsequent blocks and also improve
+// visual quality.
+static int discount_newmv_test(const VP9_COMP *cpi,
+ int this_mode,
+ int_mv this_mv,
+ int_mv (*mode_mv)[MAX_REF_FRAMES],
+ int ref_frame) {
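+  // Discount only when this NEWMV is non-zero while both the nearest and
+  // near reference mvs are zero or unavailable, i.e. when no usable motion
+  // field has yet been established around this block.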
+ return (!cpi->rc.is_src_frame_alt_ref &&
+ (this_mode == NEWMV) &&
+ (this_mv.as_int != 0) &&
+ ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
+ ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
+}
+
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize,
int64_t txfm_cache[],
@@ -2361,11 +2406,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
int (*single_skippable)[MAX_REF_FRAMES],
int64_t *psse,
- const int64_t ref_best_rd) {
+ const int64_t ref_best_rd,
+ int64_t *mask_filter,
+ int64_t filter_cache[]) {
VP9_COMMON *cm = &cpi->common;
- RD_OPT *rd_opt = &cpi->rd;
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const int is_comp_pred = has_second_ref(mbmi);
const int this_mode = mbmi->mode;
int_mv *frame_mv = mode_mv[this_mode];
@@ -2374,11 +2420,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv cur_mv[2];
#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64);
+ DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
uint8_t *tmp_buf;
#else
- DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
#endif // CONFIG_VP9_HIGHBITDEPTH
int pred_exists = 0;
int intpel_mv;
@@ -2404,16 +2449,16 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
} else {
- tmp_buf = tmp_buf8;
+ tmp_buf = (uint8_t *)tmp_buf16;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
if (pred_filter_search) {
INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
if (xd->up_available)
- af = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter;
+ af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
if (xd->left_available)
- lf = xd->mi[-1].src_mi->mbmi.interp_filter;
+ lf = xd->mi[-1]->mbmi.interp_filter;
if ((this_mode != NEWMV) || (af == lf))
best_filter = af;
@@ -2456,10 +2501,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
&tmp_mv, &rate_mv);
if (tmp_mv.as_int == INVALID_MV)
return INT64_MAX;
- *rate2 += rate_mv;
+
frame_mv[refs[0]].as_int =
- xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
single_newmv[refs[0]].as_int = tmp_mv.as_int;
+
+ // Estimate the rate implications of a new mv but discount this
+ // under certain circumstances where we want to help initiate a weak
+ // motion field, where the distortion gain for a single block may not
+ // be enough to overcome the cost of a new mv.
+ if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+ *rate2 += MAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+ } else {
+ *rate2 += rate_mv;
+ }
}
}
@@ -2484,11 +2539,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
orig_dst_stride[i] = xd->plane[i].dst.stride;
}
- /* We don't include the cost of the second reference here, because there
- * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
- * words if you present them in that order, the second one is always known
- * if the first is known */
- *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+ // We don't include the cost of the second reference here, because there
+ // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
+ // words if you present them in that order, the second one is always known
+ // if the first is known.
+ //
+ // Under some circumstances we discount the cost of new mv mode to encourage
+ // initiation of a motion field.
+ if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
+ mode_mv, refs[0])) {
+ *rate2 += MIN(cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]),
+ cost_mv_ref(cpi, NEARESTMV, mbmi->mode_context[refs[0]]));
+ } else {
+ *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+ }
if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
mbmi->mode != NEARESTMV)
@@ -2502,9 +2566,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// Search for best switchable filter by checking the variance of
// pred error irrespective of whether the filter will be used
- rd_opt->mask_filter = 0;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- rd_opt->filter_cache[i] = INT64_MAX;
+ filter_cache[i] = INT64_MAX;
if (cm->interp_filter != BILINEAR) {
if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
@@ -2521,17 +2584,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int64_t tmp_skip_sse = INT64_MAX;
mbmi->interp_filter = i;
- rs = vp9_get_switchable_rate(cpi);
+ rs = vp9_get_switchable_rate(cpi, xd);
rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
if (i > 0 && intpel_mv) {
rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
- rd_opt->filter_cache[i] = rd;
- rd_opt->filter_cache[SWITCHABLE_FILTERS] =
- MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+ filter_cache[i] = rd;
+ filter_cache[SWITCHABLE_FILTERS] =
+ MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
if (cm->interp_filter == SWITCHABLE)
rd += rs_rd;
- rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
+ *mask_filter = MAX(*mask_filter, rd);
} else {
int rate_sum = 0;
int64_t dist_sum = 0;
@@ -2559,12 +2622,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
&tmp_skip_sb, &tmp_skip_sse);
rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
- rd_opt->filter_cache[i] = rd;
- rd_opt->filter_cache[SWITCHABLE_FILTERS] =
- MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+ filter_cache[i] = rd;
+ filter_cache[SWITCHABLE_FILTERS] =
+ MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
if (cm->interp_filter == SWITCHABLE)
rd += rs_rd;
- rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
+ *mask_filter = MAX(*mask_filter, rd);
if (i == 0 && intpel_mv) {
tmp_rate_sum = rate_sum;
@@ -2595,8 +2658,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
skip_txfm_sb = tmp_skip_sb;
skip_sse_sb = tmp_skip_sse;
- vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
- vpx_memcpy(bsse, x->bsse, sizeof(bsse));
+ memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+ memcpy(bsse, x->bsse, sizeof(bsse));
}
}
restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2605,7 +2668,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// Set the appropriate filter
mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
cm->interp_filter : best_filter;
- rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0;
+ rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
if (pred_exists) {
if (best_needs_copy) {
@@ -2626,8 +2689,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
&skip_txfm_sb, &skip_sse_sb);
rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
- vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
- vpx_memcpy(bsse, x->bsse, sizeof(bsse));
+ memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+ memcpy(bsse, x->bsse, sizeof(bsse));
}
if (!is_comp_pred)
@@ -2637,7 +2700,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (is_comp_pred)
if (single_skippable[this_mode][refs[0]] &&
single_skippable[this_mode][refs[1]])
- vpx_memset(skip_txfm, 1, sizeof(skip_txfm));
+ memset(skip_txfm, 1, sizeof(skip_txfm));
if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
// if current pred_error modeled rd is substantially more than the best
@@ -2651,8 +2714,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (cm->interp_filter == SWITCHABLE)
*rate2 += rs;
- vpx_memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
- vpx_memcpy(x->bsse, bsse, sizeof(bsse));
+ memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
+ memcpy(x->bsse, bsse, sizeof(bsse));
if (!skip_txfm_sb) {
int skippable_y, skippable_uv;
@@ -2718,7 +2781,8 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
TX_SIZE max_uv_tx_size;
x->skip_encode = 0;
ctx->skip = 0;
- xd->mi[0].src_mi->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->mbmi.ref_frame[1] = NONE;
if (bsize >= BLOCK_8X8) {
if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
@@ -2735,7 +2799,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
return;
}
}
- max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0].src_mi->mbmi.tx_size, bsize,
+ max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
pd[1].subsampling_x,
pd[1].subsampling_y);
rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
@@ -2761,43 +2825,82 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- ctx->mic = *xd->mi[0].src_mi;
+ ctx->mic = *xd->mi[0];
rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
}
-static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize,
- int best_mode_index) {
- if (cpi->sf.adaptive_rd_thresh > 0) {
- const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
- int mode;
- for (mode = 0; mode < top_mode; ++mode) {
- const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
- const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
- BLOCK_SIZE bs;
- for (bs = min_size; bs <= max_size; ++bs) {
- int *const fact = &cpi->rd.thresh_freq_fact[bs][mode];
- if (mode == best_mode_index) {
- *fact -= (*fact >> 4);
- } else {
- *fact = MIN(*fact + RD_THRESH_INC,
- cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
- }
- }
- }
+// This function is designed to apply a bias or adjustment to an rd value based
+// on the relative variance of the source and reconstruction.
+#define LOW_VAR_THRESH 16
+#define VLOW_ADJ_MAX 25
+#define VHIGH_ADJ_MAX 8
+static void rd_variance_adjustment(VP9_COMP *cpi,
+ MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ int64_t *this_rd,
+ MV_REFERENCE_FRAME ref_frame,
+ unsigned int source_variance) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int recon_variance;
+ unsigned int absvar_diff = 0;
+ int64_t var_error = 0;
+ int64_t var_factor = 0;
+
+ if (*this_rd == INT64_MAX)
+ return;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ recon_variance =
+ vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd);
+ } else {
+ recon_variance =
+ vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+ }
+#else
+ recon_variance =
+ vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if ((source_variance + recon_variance) > LOW_VAR_THRESH) {
+ absvar_diff = (source_variance > recon_variance)
+ ? (source_variance - recon_variance)
+ : (recon_variance - source_variance);
+
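+  // var_error is a symmetric measure in [0, 100]: it is 0 when the source
+  // and reconstruction variances are equal and approaches 100 as they
+  // diverge.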
+ var_error = (200 * source_variance * recon_variance) /
+ ((source_variance * source_variance) +
+ (recon_variance * recon_variance));
+ var_error = 100 - var_error;
+ }
+
+ // Source variance above a threshold and ref frame is intra.
+ // This case is targeted mainly at discouraging intra modes that give rise
+ // to a predictor with a low spatial complexity compared to the source.
+ if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) &&
+ (source_variance > recon_variance)) {
+ var_factor = MIN(absvar_diff, MIN(VLOW_ADJ_MAX, var_error));
+ // A second possible case of interest is where the source variance
+ // is very low and we wish to discourage false texture or motion trails.
+ } else if ((source_variance < (LOW_VAR_THRESH >> 1)) &&
+ (recon_variance > source_variance)) {
+ var_factor = MIN(absvar_diff, MIN(VHIGH_ADJ_MAX, var_error));
}
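+  // var_factor is a percentage (at most VLOW_ADJ_MAX); inflate this_rd by
+  // that fraction to bias the search away from the flagged mode.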
+ *this_rd += (*this_rd * var_factor) / 100;
}
-void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- const TileInfo *const tile,
+void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
+ TileDataEnc *tile_data,
+ MACROBLOCK *x,
int mi_row, int mi_col,
RD_COST *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
RD_OPT *const rd_opt = &cpi->rd;
SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const struct segmentation *const seg = &cm->seg;
PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
@@ -2836,14 +2939,20 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
int mode_skip_start = sf->mode_skip_start + 1;
const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
- const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
+ const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
int64_t mode_threshold[MAX_MODES];
- int *mode_map = rd_opt->mode_map[bsize];
+ int *mode_map = tile_data->mode_map[bsize];
const int mode_search_skip_flags = sf->mode_search_skip_flags;
+ int64_t mask_filter = 0;
+ int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+
vp9_zero(best_mbmode);
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ filter_cache[i] = INT64_MAX;
+
estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
@@ -2869,7 +2978,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
x->pred_mv_sad[ref_frame] = INT_MAX;
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col,
+ assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+ setup_buffer_inter(cpi, x, tile_info, ref_frame, bsize, mi_row, mi_col,
frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
@@ -2948,7 +3058,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
mode_skip_mask[INTRA_FRAME] |=
~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
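+  // Modes up to LAST_NEW_MV_INDEX are never skipped by threshold; the
+  // remaining modes scale their base threshold by the per-tile frequency
+  // factor, normalized by the >> 5.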
- for (i = 0; i < MAX_MODES; ++i)
+ for (i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+ mode_threshold[i] = 0;
+ for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
midx = sf->schedule_mode_search ? mode_skip_start : 0;
@@ -3007,8 +3119,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- if (ref_frame_skip_mask[0] & (1 << ref_frame) &&
- ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame)))
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+ (ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
continue;
if (mode_skip_mask[ref_frame] & (1 << this_mode))
@@ -3023,9 +3135,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (sf->motion_field_mode_search) {
const int mi_width = MIN(num_8x8_blocks_wide_lookup[bsize],
- tile->mi_col_end - mi_col);
+ tile_info->mi_col_end - mi_col);
const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
- tile->mi_row_end - mi_row);
+ tile_info->mi_row_end - mi_row);
const int bsl = mi_width_log2_lookup[bsize];
int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
+ get_chessboard_index(cm->current_video_frame)) & 0x1;
@@ -3036,24 +3148,24 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int_mv ref_mv;
ref_mv.as_int = INVALID_MV;
- if ((mi_row - 1) >= tile->mi_row_start) {
- ref_mv = xd->mi[-xd->mi_stride].src_mi->mbmi.mv[0];
- rf = xd->mi[-xd->mi_stride].src_mi->mbmi.ref_frame[0];
+ if ((mi_row - 1) >= tile_info->mi_row_start) {
+ ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0];
+ rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0];
for (i = 0; i < mi_width; ++i) {
- ref_mbmi = &xd->mi[-xd->mi_stride + i].src_mi->mbmi;
+ ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi;
const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
(ref_frame == ref_mbmi->ref_frame[0]);
skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
}
}
- if ((mi_col - 1) >= tile->mi_col_start) {
+ if ((mi_col - 1) >= tile_info->mi_col_start) {
if (ref_mv.as_int == INVALID_MV)
- ref_mv = xd->mi[-1].src_mi->mbmi.mv[0];
+ ref_mv = xd->mi[-1]->mbmi.mv[0];
if (rf == NONE)
- rf = xd->mi[-1].src_mi->mbmi.ref_frame[0];
+ rf = xd->mi[-1]->mbmi.ref_frame[0];
for (i = 0; i < mi_height; ++i) {
- ref_mbmi = &xd->mi[i * xd->mi_stride - 1].src_mi->mbmi;
+ ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi;
const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
(ref_frame == ref_mbmi->ref_frame[0]);
skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
@@ -3072,7 +3184,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
- if (!cm->allow_comp_inter_inter)
+ if (!cpi->allow_comp_inter_inter)
continue;
// Skip compound inter modes if ARF is not available.
@@ -3153,7 +3265,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
struct macroblockd_plane *const pd = &xd->plane[1];
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
NULL, bsize, tx_cache, best_rd);
if (rate_y == INT_MAX)
@@ -3162,7 +3274,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
pd->subsampling_y);
if (rate_uv_intra[uv_tx] == INT_MAX) {
- choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
&rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
&dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
}
@@ -3184,7 +3296,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
&disable_skip, frame_mv,
mi_row, mi_col,
single_newmv, single_inter_filter,
- single_skippable, &total_sse, best_rd);
+ single_skippable, &total_sse, best_rd,
+ &mask_filter, filter_cache);
if (this_rd == INT64_MAX)
continue;
@@ -3231,6 +3344,11 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
}
+ // Apply an adjustment to the rd value based on the similarity of the
+ // source variance and reconstructed variance.
+ rd_variance_adjustment(cpi, x, bsize, &this_rd,
+ ref_frame, x->source_variance);
+
if (ref_frame == INTRA_FRAME) {
// Keep record of best intra rd
if (this_rd < best_intra_rd) {
@@ -3271,8 +3389,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (!x->select_tx_size)
swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
- vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
- sizeof(uint8_t) * ctx->num_4x4_blk);
+ memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
// TODO(debargha): enhance this test with a better distortion prediction
// based on qp, activity mask and history
@@ -3325,21 +3443,21 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best filter type */
if (!mode_excluded && cm->interp_filter != BILINEAR) {
- int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
+ int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
SWITCHABLE_FILTERS : cm->interp_filter];
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
int64_t adj_rd;
if (ref == INT64_MAX)
adj_rd = 0;
- else if (rd_opt->filter_cache[i] == INT64_MAX)
+ else if (filter_cache[i] == INT64_MAX)
// when early termination is triggered, the encoder does not have
// access to the rate-distortion cost. it only knows that the cost
// should be above the maximum valid value. hence it takes the known
// maximum plus an arbitrary constant as the rate-distortion cost.
- adj_rd = rd_opt->mask_filter - ref + 10;
+ adj_rd = mask_filter - ref + 10;
else
- adj_rd = rd_opt->filter_cache[i] - ref;
+ adj_rd = filter_cache[i] - ref;
adj_rd += this_rd;
best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
@@ -3420,7 +3538,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
!is_inter_block(&best_mbmode));
if (!cpi->rc.is_src_frame_alt_ref)
- update_rd_thresh_fact(cpi, bsize, best_mode_index);
+ vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize, best_mode_index);
// macroblock modes
*mbmi = best_mbmode;
@@ -3460,7 +3579,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (!x->skip && !x->select_tx_size) {
int has_high_freq_coeff = 0;
int plane;
- int max_plane = is_inter_block(&xd->mi[0].src_mi->mbmi)
+ int max_plane = is_inter_block(&xd->mi[0]->mbmi)
? MAX_MB_PLANE : 1;
for (plane = 0; plane < max_plane; ++plane) {
x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
@@ -3475,19 +3594,22 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_mode_skippable |= !has_high_freq_coeff;
}
+ assert(best_mode_index >= 0);
+
store_coding_context(x, ctx, best_mode_index, best_pred_diff,
best_tx_diff, best_filter_diff, best_mode_skippable);
}
-void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
+void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
+ TileDataEnc *tile_data,
+ MACROBLOCK *x,
RD_COST *rd_cost,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
VP9_COMMON *const cm = &cpi->common;
- RD_OPT *const rd_opt = &cpi->rd;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
unsigned char segment_id = mbmi->segment_id;
const int comp_pred = 0;
int i;
@@ -3522,12 +3644,6 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->mv[0].as_int = 0;
x->skip = 1;
- // Search for best switchable filter by checking the variance of
- // pred error irrespective of whether the filter will be used
- rd_opt->mask_filter = 0;
- for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- rd_opt->filter_cache[i] = INT64_MAX;
-
if (cm->interp_filter != BILINEAR) {
best_filter = EIGHTTAP;
if (cm->interp_filter == SWITCHABLE &&
@@ -3536,7 +3652,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
int best_rs = INT_MAX;
for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
mbmi->interp_filter = i;
- rs = vp9_get_switchable_rate(cpi);
+ rs = vp9_get_switchable_rate(cpi, xd);
if (rs < best_rs) {
best_rs = rs;
best_filter = mbmi->interp_filter;
@@ -3547,7 +3663,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
// Set the appropriate filter
if (cm->interp_filter == SWITCHABLE) {
mbmi->interp_filter = best_filter;
- rate2 += vp9_get_switchable_rate(cpi);
+ rate2 += vp9_get_switchable_rate(cpi, xd);
} else {
mbmi->interp_filter = cm->interp_filter;
}
@@ -3573,7 +3689,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
assert((cm->interp_filter == SWITCHABLE) ||
(cm->interp_filter == mbmi->interp_filter));
- update_rd_thresh_fact(cpi, bsize, THR_ZEROMV);
+ vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+ cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
vp9_zero(best_pred_diff);
vp9_zero(best_filter_diff);
@@ -3585,18 +3702,20 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
best_pred_diff, best_tx_diff, best_filter_diff, 0);
}
-void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
- const TileInfo *const tile,
+void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
+ TileDataEnc *tile_data,
+ MACROBLOCK *x,
int mi_row, int mi_col,
RD_COST *rd_cost,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
RD_OPT *const rd_opt = &cpi->rd;
SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const struct segmentation *const seg = &cm->seg;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = mbmi->segment_id;
@@ -3627,11 +3746,16 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
b_mode_info best_bmodes[4];
int best_skip2 = 0;
int ref_frame_skip_mask[2] = { 0 };
+ int64_t mask_filter = 0;
+ int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
- vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
+ memset(x->zcoeff_blk[TX_4X4], 0, 4);
vp9_zero(best_mbmode);
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ filter_cache[i] = INT64_MAX;
+
for (i = 0; i < 4; i++) {
int j;
for (j = 0; j < MAX_REF_FRAMES; j++)
@@ -3651,10 +3775,10 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, tile,
- ref_frame, bsize, mi_row, mi_col,
- frame_mv[NEARESTMV], frame_mv[NEARMV],
- yv12_mb);
+ setup_buffer_inter(cpi, x, tile_info,
+ ref_frame, bsize, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV],
+ yv12_mb);
} else {
ref_frame_skip_mask[0] |= (1 << ref_frame);
ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
@@ -3705,19 +3829,19 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- if (ref_frame_skip_mask[0] & (1 << ref_frame) &&
- ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame)))
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+ (ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
continue;
// Test best rd so far against threshold for trying this mode.
if (rd_less_than_thresh(best_rd,
rd_opt->threshes[segment_id][bsize][ref_index],
- rd_opt->thresh_freq_fact[bsize][ref_index]))
+ tile_data->thresh_freq_fact[bsize][ref_index]))
continue;
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
- if (!cm->allow_comp_inter_inter)
+ if (!cpi->allow_comp_inter_inter)
continue;
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
continue;
@@ -3791,7 +3915,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
distortion2 += distortion_y;
if (rate_uv_intra == INT_MAX) {
- choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
+ choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4,
&rate_uv_intra,
&rate_uv_tokenonly,
&dist_uv, &skip_uv,
@@ -3824,9 +3948,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
rd_opt->threshes[segment_id][bsize][THR_ALTR];
this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
- rd_opt->mask_filter = 0;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- rd_opt->filter_cache[i] = INT64_MAX;
+ filter_cache[i] = INT64_MAX;
if (cm->interp_filter != BILINEAR) {
tmp_best_filter = EIGHTTAP;
@@ -3845,7 +3968,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
int newbest, rs;
int64_t rs_rd;
mbmi->interp_filter = switchable_filter_index;
- tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
+ tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
&mbmi->ref_mvs[ref_frame][0],
second_ref, best_yrd, &rate,
&rate_y, &distortion,
@@ -3856,16 +3979,16 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (tmp_rd == INT64_MAX)
continue;
- rs = vp9_get_switchable_rate(cpi);
+ rs = vp9_get_switchable_rate(cpi, xd);
rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
- rd_opt->filter_cache[switchable_filter_index] = tmp_rd;
- rd_opt->filter_cache[SWITCHABLE_FILTERS] =
- MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS],
+ filter_cache[switchable_filter_index] = tmp_rd;
+ filter_cache[SWITCHABLE_FILTERS] =
+ MIN(filter_cache[SWITCHABLE_FILTERS],
tmp_rd + rs_rd);
if (cm->interp_filter == SWITCHABLE)
tmp_rd += rs_rd;
- rd_opt->mask_filter = MAX(rd_opt->mask_filter, tmp_rd);
+ mask_filter = MAX(mask_filter, tmp_rd);
newbest = (tmp_rd < tmp_best_rd);
if (newbest) {
@@ -3883,7 +4006,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
tmp_best_skippable = skippable;
tmp_best_mbmode = *mbmi;
for (i = 0; i < 4; i++) {
- tmp_best_bmodes[i] = xd->mi[0].src_mi->bmi[i];
+ tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
}
pred_exists = 1;
@@ -3911,7 +4034,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (!pred_exists) {
// Handles the special case when a filter that is not in the
// switchable list (bilinear, 6-tap) is indicated at the frame level
- tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
+ tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
&mbmi->ref_mvs[ref_frame][0],
second_ref, best_yrd, &rate, &rate_y,
&distortion, &skippable, &total_sse,
@@ -3927,14 +4050,14 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
skippable = tmp_best_skippable;
*mbmi = tmp_best_mbmode;
for (i = 0; i < 4; i++)
- xd->mi[0].src_mi->bmi[i] = tmp_best_bmodes[i];
+ xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
}
rate2 += rate;
distortion2 += distortion;
if (cm->interp_filter == SWITCHABLE)
- rate2 += vp9_get_switchable_rate(cpi);
+ rate2 += vp9_get_switchable_rate(cpi, xd);
if (!mode_excluded)
mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
@@ -3951,7 +4074,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
// then don't bother looking at UV
vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
BLOCK_8X8);
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
&uv_sse, BLOCK_8X8, tmp_best_rdu))
continue;
@@ -4032,11 +4155,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
best_skip2 = this_skip2;
if (!x->select_tx_size)
swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
- vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
- sizeof(uint8_t) * ctx->num_4x4_blk);
+ memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
for (i = 0; i < 4; i++)
- best_bmodes[i] = xd->mi[0].src_mi->bmi[i];
+ best_bmodes[i] = xd->mi[0]->bmi[i];
// TODO(debargha): enhance this test with a better distortion prediction
// based on qp, activity mask and history
@@ -4089,20 +4212,20 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best filter type */
if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
cm->interp_filter != BILINEAR) {
- int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
+ int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
SWITCHABLE_FILTERS : cm->interp_filter];
int64_t adj_rd;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
if (ref == INT64_MAX)
adj_rd = 0;
- else if (rd_opt->filter_cache[i] == INT64_MAX)
+ else if (filter_cache[i] == INT64_MAX)
// when early termination is triggered, the encoder does not have
// access to the rate-distortion cost. it only knows that the cost
// should be above the maximum valid value. hence it takes the known
// maximum plus an arbitrary constant as the rate-distortion cost.
- adj_rd = rd_opt->mask_filter - ref + 10;
+ adj_rd = mask_filter - ref + 10;
else
- adj_rd = rd_opt->filter_cache[i] - ref;
+ adj_rd = filter_cache[i] - ref;
adj_rd += this_rd;
best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
@@ -4146,21 +4269,21 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
(cm->interp_filter == best_mbmode.interp_filter) ||
!is_inter_block(&best_mbmode));
- update_rd_thresh_fact(cpi, bsize, best_ref_index);
+ vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize, best_ref_index);
// macroblock modes
*mbmi = best_mbmode;
x->skip |= best_skip2;
if (!is_inter_block(&best_mbmode)) {
for (i = 0; i < 4; i++)
- xd->mi[0].src_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
+ xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
} else {
for (i = 0; i < 4; ++i)
- vpx_memcpy(&xd->mi[0].src_mi->bmi[i], &best_bmodes[i],
- sizeof(b_mode_info));
+ memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
- mbmi->mv[0].as_int = xd->mi[0].src_mi->bmi[3].as_mv[0].as_int;
- mbmi->mv[1].as_int = xd->mi[0].src_mi->bmi[3].as_mv[1].as_int;
+ mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
}
for (i = 0; i < REFERENCE_MODES; ++i) {
@@ -4186,4 +4309,3 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
store_coding_context(x, ctx, best_ref_index,
best_pred_diff, best_tx_diff, best_filter_diff, 0);
}
-
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h
index ed38ce81a2e..459b0324bcf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h
@@ -29,14 +29,25 @@ void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
struct RD_COST *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
-void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
- const struct TileInfo *const tile,
+unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs);
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd);
+#endif
+
+void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x,
int mi_row, int mi_col,
struct RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far);
void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
+ struct TileDataEnc *tile_data,
struct macroblock *x,
struct RD_COST *rd_cost,
BLOCK_SIZE bsize,
@@ -44,8 +55,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
int64_t best_rd_so_far);
void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
+ struct TileDataEnc *tile_data,
struct macroblock *x,
- const struct TileInfo *const tile,
int mi_row, int mi_col,
struct RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c
index 3d361d4f267..2ebdff291d6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c
@@ -516,6 +516,10 @@ void vp9_resize_plane(const uint8_t *const input,
uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) *
(width < height ? height : width));
uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2));
+ assert(width > 0);
+ assert(height > 0);
+ assert(width2 > 0);
+ assert(height2 > 0);
for (i = 0; i < height; ++i)
resize_multistep(input + in_stride * i, width,
intbuf + width2 * i, width2, tmpbuf);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c
index f1d51770ab3..9b15072e98e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c
@@ -36,11 +36,7 @@ void vp9_set_segment_data(struct segmentation *seg,
unsigned char abs_delta) {
seg->abs_delta = abs_delta;
- vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
-
- // TBD ?? Set the feature mask
- // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
- // sizeof(cpi->mb.e_mbd.segment_feature_mask));
+ memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
}
void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id) {
@@ -111,7 +107,7 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) {
}
static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
- const TileInfo *tile, MODE_INFO *mi,
+ const TileInfo *tile, MODE_INFO **mi,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -122,7 +118,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
return;
xd->mi = mi;
- segment_id = xd->mi[0].src_mi->mbmi.segment_id;
+ segment_id = xd->mi[0]->mbmi.segment_id;
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
@@ -131,7 +127,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
// Temporal prediction not allowed on key frames
if (cm->frame_type != KEY_FRAME) {
- const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
// Test to see if the segment id matches the predicted value.
const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
bsize, mi_row, mi_col);
@@ -140,7 +136,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
// Store the prediction status for this mb and update counts
// as appropriate
- xd->mi[0].src_mi->mbmi.seg_id_predicted = pred_flag;
+ xd->mi[0]->mbmi.seg_id_predicted = pred_flag;
temporal_predictor_count[pred_context][pred_flag]++;
// Update the "unpredicted" segment count
@@ -150,7 +146,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
}
static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd,
- const TileInfo *tile, MODE_INFO *mi,
+ const TileInfo *tile, MODE_INFO **mi,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -163,8 +159,8 @@ static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- bw = num_8x8_blocks_wide_lookup[mi[0].src_mi->mbmi.sb_type];
- bh = num_8x8_blocks_high_lookup[mi[0].src_mi->mbmi.sb_type];
+ bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type];
+ bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type];
if (bw == bs && bh == bs) {
count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
@@ -217,20 +213,20 @@ void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) {
// Set default state for the segment tree probabilities and the
// temporal coding probabilities
- vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
- vpx_memset(seg->pred_probs, 255, sizeof(seg->pred_probs));
+ memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+ memset(seg->pred_probs, 255, sizeof(seg->pred_probs));
// First of all generate stats regarding how well the last segment map
// predicts this one
for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
TileInfo tile;
- MODE_INFO *mi_ptr;
+ MODE_INFO **mi_ptr;
vp9_tile_init(&tile, cm, 0, tile_col);
- mi_ptr = cm->mi + tile.mi_col_start;
+ mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
for (mi_row = 0; mi_row < cm->mi_rows;
mi_row += 8, mi_ptr += 8 * cm->mi_stride) {
- MODE_INFO *mi = mi_ptr;
+ MODE_INFO **mi = mi_ptr;
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += 8, mi += 8)
count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts,
@@ -267,11 +263,11 @@ void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) {
// Now choose which coding method to use.
if (t_pred_cost < no_pred_cost) {
seg->temporal_update = 1;
- vpx_memcpy(seg->tree_probs, t_pred_tree, sizeof(t_pred_tree));
- vpx_memcpy(seg->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
+ memcpy(seg->tree_probs, t_pred_tree, sizeof(t_pred_tree));
+ memcpy(seg->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
} else {
seg->temporal_update = 0;
- vpx_memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree));
+ memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree));
}
}
@@ -280,6 +276,6 @@ void vp9_reset_segment_features(struct segmentation *seg) {
seg->enabled = 0;
seg->update_map = 0;
seg->update_data = 0;
- vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+ memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
vp9_clearall_segfeatures(seg);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.c
new file mode 100644
index 00000000000..1cb0662834e
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[2] = {7463, 9614}; // q6
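+// (About 117 for Cb and 150 for Cr in 8-bit pixel units.)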
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157}; // q16
+static const int skin_threshold = 1570636; // q18
+
+// Thresholds on luminance.
+static const int y_low = 20;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
+static int evaluate_skin_color_difference(int cb, int cr) {
+ const int cb_q6 = cb << 6;
+ const int cr_q6 = cr << 6;
+ const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
+ const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
+ const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+ const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+ const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+ const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
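+  // Multiplying the q16 inverse covariance by the q2 differences yields a
+  // q18 value, matching the q18 skin_threshold it is compared against.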
+ const int skin_diff = skin_inv_cov[0] * cb_diff_q2 +
+ skin_inv_cov[1] * cbcr_diff_q2 +
+ skin_inv_cov[2] * cbcr_diff_q2 +
+ skin_inv_cov[3] * cr_diff_q2;
+ return skin_diff;
+}
+
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
+ if (y < y_low || y > y_high)
+ return 0;
+ else
+ return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
+ int i, j, mi_row, mi_col;
+ VP9_COMMON *const cm = &cpi->common;
+ uint8_t *y;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const uint8_t *src_u = cpi->Source->u_buffer;
+ const uint8_t *src_v = cpi->Source->v_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ const int src_uvstride = cpi->Source->uv_stride;
+ YV12_BUFFER_CONFIG skinmap;
+ memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
+ if (vp9_alloc_frame_buffer(&skinmap, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment)) {
+ vp9_free_frame_buffer(&skinmap);
+ return;
+ }
+ memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+ y = skinmap.y_buffer;
+ // Loop through 8x8 blocks and set skin map based on center pixel of block.
+ // Set y to white for skin blocks; otherwise copy the source luma (the map
+ // stays gray-scale).
+ // Ignore rightmost/bottom boundary blocks.
+ for (mi_row = 0; mi_row < cm->mi_rows - 1; ++mi_row) {
+ for (mi_col = 0; mi_col < cm->mi_cols - 1; ++mi_col) {
+ // Use middle pixel for each 8x8 block for skin detection.
+ // If middle pixel is skin, assign whole 8x8 block to skin.
+ const uint8_t ysource = src_y[4 * src_ystride + 4];
+ const uint8_t usource = src_u[2 * src_uvstride + 2];
+ const uint8_t vsource = src_v[2 * src_uvstride + 2];
+ const int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ if (is_skin)
+ y[i * src_ystride + j] = 255;
+ else
+ y[i * src_ystride + j] = src_y[i * src_ystride + j];
+ }
+ }
+ y += 8;
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ }
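+    // Advance to the start of the next 8-pixel block row: move down 8 rows
+    // and rewind the columns walked by the inner loop.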
+ y += (src_ystride << 3) - ((cm->mi_cols - 1) << 3);
+ src_y += (src_ystride << 3) - ((cm->mi_cols - 1) << 3);
+ src_u += (src_uvstride << 2) - ((cm->mi_cols - 1) << 2);
+ src_v += (src_uvstride << 2) - ((cm->mi_cols - 1) << 2);
+ }
+ vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file);
+ vp9_free_frame_buffer(&skinmap);
+}
+#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.h
new file mode 100644
index 00000000000..3d4e7375f76
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SKIN_MAP_H_
+#define VP9_ENCODER_VP9_SKIN_MAP_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+
+// #define OUTPUT_YUV_SKINMAP
+
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_ENCODER_VP9_SKIN_MAP_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
index 9e3ee2c94bd..4f93578326b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
@@ -16,12 +16,84 @@
// Intra only frames, golden frames (except alt ref overlays) and
// alt ref frames tend to be coded at a higher than ambient quality
static int frame_is_boosted(const VP9_COMP *cpi) {
- return frame_is_intra_only(&cpi->common) ||
- cpi->refresh_alt_ref_frame ||
- (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) ||
- vp9_is_upper_layer_key_frame(cpi);
+ return frame_is_kf_gf_arf(cpi) || vp9_is_upper_layer_key_frame(cpi);
}
+static void set_good_speed_feature_framesize_dependent(VP9_COMMON *cm,
+ SPEED_FEATURES *sf,
+ int speed) {
+ if (speed >= 1) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+ : DISABLE_ALL_INTER_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ } else {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 21);
+ }
+ }
+
+ if (speed >= 2) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+ : DISABLE_ALL_INTER_SPLIT;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ sf->partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+ sf->partition_search_breakout_dist_thr = (1 << 22);
+ sf->partition_search_breakout_rate_thr = 100;
+ }
+ }
+
+ if (speed >= 3) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
+ sf->partition_search_breakout_dist_thr = (1 << 25);
+ sf->partition_search_breakout_rate_thr = 200;
+ } else {
+ sf->max_intra_bsize = BLOCK_32X32;
+ sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+ sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ sf->partition_search_breakout_rate_thr = 120;
+ }
+ }
+
+ if (speed >= 4) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ }
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ }
+}
+
+// Sets a partition size down to which the auto partition code will always
+// search (it can go lower), based on the image dimensions. The logic here
+// is that the extent to which ringing artefacts are offensive depends
+// partly on the screen area over which they propagate. Propagation is
+// limited by transform block size, but the screen area taken up by a given
+// block size will be larger for a small image format stretched to full screen.
+static BLOCK_SIZE set_partition_min_limit(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ unsigned int screen_area = (cm->width * cm->height);
+
+ // Select block size based on image format size.
+ if (screen_area < 1280 * 720) {
+ // Formats smaller in area than 720P
+ return BLOCK_4X4;
+ } else if (screen_area < 1920 * 1080) {
+ // Format >= 720P and < 1080P
+ return BLOCK_8X8;
+ } else {
+ // Formats 1080P and up
+ return BLOCK_16X16;
+ }
+}
static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
SPEED_FEATURES *sf, int speed) {
@@ -34,11 +106,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->use_square_partition_only = !frame_is_intra_only(cm);
sf->less_rectangular_check = 1;
- if (MIN(cm->width, cm->height) >= 720)
- sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
- : DISABLE_ALL_INTER_SPLIT;
- else
- sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
sf->mv.auto_mv_step_size = 1;
@@ -54,11 +121,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->tx_size_search_breakout = 1;
-
- if (MIN(cm->width, cm->height) >= 720)
- sf->partition_search_breakout_dist_thr = (1 << 23);
- else
- sf->partition_search_breakout_dist_thr = (1 << 21);
sf->partition_search_breakout_rate_thr = 80;
}
@@ -66,45 +128,24 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
: USE_LARGESTALL;
- if (MIN(cm->width, cm->height) >= 720) {
- sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
- : DISABLE_ALL_INTER_SPLIT;
- sf->adaptive_pred_interp_filter = 0;
- sf->partition_search_breakout_dist_thr = (1 << 24);
- sf->partition_search_breakout_rate_thr = 120;
- } else {
- sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
- sf->partition_search_breakout_dist_thr = (1 << 22);
- sf->partition_search_breakout_rate_thr = 100;
- }
+ // Reference masking is not supported in dynamic scaling mode.
+ sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0;
- sf->reference_masking = 1;
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
+ FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
FLAG_SKIP_INTRA_LOWVAR;
sf->disable_filter_search_var_thresh = 100;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
- sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX;
-
+ sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+ sf->rd_auto_partition_min_limit = set_partition_min_limit(cpi);
sf->allow_partition_search_skip = 1;
}
if (speed >= 3) {
sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
: USE_LARGESTALL;
- if (MIN(cm->width, cm->height) >= 720) {
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
- sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
- sf->partition_search_breakout_dist_thr = (1 << 25);
- sf->partition_search_breakout_rate_thr = 200;
- } else {
- sf->max_intra_bsize = BLOCK_32X32;
- sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
- sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
- sf->partition_search_breakout_dist_thr = (1 << 23);
- sf->partition_search_breakout_rate_thr = 120;
- }
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 1;
@@ -122,28 +163,21 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
if (speed >= 4) {
sf->use_square_partition_only = 1;
sf->tx_size_search_method = USE_LARGESTALL;
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
sf->mv.search_method = BIGDIA;
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
sf->adaptive_rd_thresh = 4;
- sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
+ if (cm->frame_type != KEY_FRAME)
+ sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
sf->disable_filter_search_var_thresh = 200;
sf->use_lp32x32fdct = 1;
sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->use_fast_coef_costing = 1;
sf->motion_field_mode_search = !boosted;
-
- if (MIN(cm->width, cm->height) >= 720)
- sf->partition_search_breakout_dist_thr = (1 << 26);
- else
- sf->partition_search_breakout_dist_thr = (1 << 24);
sf->partition_search_breakout_rate_thr = 300;
}
if (speed >= 5) {
int i;
-
- sf->partition_search_type = FIXED_PARTITION;
sf->optimize_coefficients = 0;
sf->mv.search_method = HEX;
sf->disable_filter_search_var_thresh = 500;
@@ -151,12 +185,47 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->intra_y_mode_mask[i] = INTRA_DC;
sf->intra_uv_mode_mask[i] = INTRA_DC;
}
- }
- if (speed >= 6) {
+ sf->partition_search_breakout_rate_thr = 500;
sf->mv.reduce_first_step_size = 1;
}
}
+static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
+ SPEED_FEATURES *sf, int speed) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ if (speed >= 1) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+ : DISABLE_ALL_INTER_SPLIT;
+ } else {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ }
+ }
+
+ if (speed >= 2) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+ : DISABLE_ALL_INTER_SPLIT;
+ } else {
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+ }
+ }
+
+ if (speed >= 5) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->partition_search_breakout_dist_thr = (1 << 25);
+ } else {
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ }
+ }
+
+ if (speed >= 7) {
+ sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
+ 800 : 300;
+ }
+}
+
static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
int speed, vp9e_tune_content content) {
VP9_COMMON *const cm = &cpi->common;
@@ -172,12 +241,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
: USE_LARGESTALL;
- if (MIN(cm->width, cm->height) >= 720)
- sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
- : DISABLE_ALL_INTER_SPLIT;
- else
- sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
@@ -190,22 +253,19 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
}
if (speed >= 2) {
- if (MIN(cm->width, cm->height) >= 720)
- sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
- : DISABLE_ALL_INTER_SPLIT;
- else
- sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
+ FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
FLAG_SKIP_INTRA_LOWVAR;
sf->adaptive_pred_interp_filter = 2;
- sf->reference_masking = 1;
+
+ // Reference masking is not supported in dynamic scaling mode.
+ sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0;
+
sf->disable_filter_search_var_thresh = 50;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
sf->adjust_partitioning_from_last_frame = 1;
sf->last_partitioning_redo_frequency = 3;
@@ -217,12 +277,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
if (speed >= 3) {
sf->use_square_partition_only = 1;
sf->disable_filter_search_var_thresh = 100;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
- sf->constrain_copy_partition = 1;
sf->use_uv_intra_rd_estimate = 1;
sf->skip_encode_sb = 1;
sf->mv.subpel_iters_per_step = 1;
- sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->adaptive_rd_thresh = 4;
sf->mode_skip_start = 6;
sf->allow_skip_recode = 0;
@@ -261,10 +318,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->use_quant_fp = !is_keyframe;
sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX
: STRICT_NEIGHBORING_MIN_MAX;
- sf->max_partition_size = BLOCK_32X32;
- sf->min_partition_size = BLOCK_8X8;
- sf->partition_check =
- (frames_since_key % sf->last_partitioning_redo_frequency == 1);
+ sf->default_max_partition_size = BLOCK_32X32;
+ sf->default_min_partition_size = BLOCK_8X8;
sf->force_frame_boost = is_keyframe ||
(frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
sf->max_delta_qindex = is_keyframe ? 20 : 15;
@@ -275,63 +330,87 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
-
- if (MIN(cm->width, cm->height) >= 720)
- sf->partition_search_breakout_dist_thr = (1 << 25);
- else
- sf->partition_search_breakout_dist_thr = (1 << 23);
+ sf->adaptive_rd_thresh = 2;
+ // This feature is only enabled when partition search is disabled.
+ sf->reuse_inter_pred_sby = 1;
sf->partition_search_breakout_rate_thr = 200;
- }
+ sf->coeff_prob_appx_step = 4;
+ sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED;
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
- if (speed >= 6) {
- if (content == VP9E_CONTENT_SCREEN) {
+ if (!is_keyframe) {
int i;
- // Allow fancy modes at all sizes since SOURCE_VAR_BASED_PARTITION is used
- for (i = 0; i < BLOCK_SIZES; ++i)
- sf->inter_mode_mask[i] = INTER_NEAREST_NEAR_NEW;
+ if (content == VP9E_CONTENT_SCREEN) {
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
+ } else {
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ if (i >= BLOCK_16X16)
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+ else
+          // Use H and V intra mode for block sizes < 16X16.
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V;
+ }
}
+ }
+ if (speed >= 6) {
// Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
sf->partition_search_type = VAR_BASED_PARTITION;
- sf->search_type_check_frequency = 50;
+    // Turn this on to use non-RD key frame coding mode.
+ sf->use_nonrd_pick_mode = 1;
sf->mv.search_method = NSTEP;
-
sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
-
- // This feature is only enabled when partition search is disabled.
- sf->reuse_inter_pred_sby = 1;
-
- // Increase mode checking threshold for NEWMV.
- sf->elevate_newmv_thresh = 1000;
-
sf->mv.reduce_first_step_size = 1;
+ sf->skip_encode_sb = 0;
}
if (speed >= 7) {
+ sf->adaptive_rd_thresh = 3;
sf->mv.search_method = FAST_DIAMOND;
sf->mv.fullpel_search_step_param = 10;
+ }
+ if (speed >= 8) {
+ sf->adaptive_rd_thresh = 4;
+ sf->mv.subpel_force_stop = 2;
sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
- sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
- 800 : 300;
- sf->elevate_newmv_thresh = 2500;
}
+}
- if (speed >= 12) {
- sf->elevate_newmv_thresh = 4000;
- sf->mv.subpel_force_stop = 2;
+void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ RD_OPT *const rd = &cpi->rd;
+ int i;
+
+ if (oxcf->mode == REALTIME) {
+ set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+ } else if (oxcf->mode == GOOD) {
+ set_good_speed_feature_framesize_dependent(cm, sf, oxcf->speed);
}
- if (speed >= 13) {
- int i;
- sf->max_intra_bsize = BLOCK_32X32;
- for (i = 0; i < BLOCK_SIZES; ++i)
- sf->inter_mode_mask[i] = INTER_NEAREST;
+ if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
+ sf->adaptive_pred_interp_filter = 0;
+ }
+
+ if (cpi->encode_breakout && oxcf->mode == REALTIME &&
+ sf->encode_breakout_thresh > cpi->encode_breakout) {
+ cpi->encode_breakout = sf->encode_breakout_thresh;
+ }
+
+ // Check for masked out split cases.
+ for (i = 0; i < MAX_REFS; ++i) {
+ if (sf->disable_split_mask & (1 << i)) {
+ rd->thresh_mult_sub8x8[i] = INT_MAX;
+ }
}
}
-void vp9_set_speed_features(VP9_COMP *cpi) {
+void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
SPEED_FEATURES *const sf = &cpi->sf;
VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
int i;
@@ -344,11 +423,11 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->mv.subpel_force_stop = 0;
sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
sf->mv.reduce_first_step_size = 0;
+ sf->coeff_prob_appx_step = 1;
sf->mv.auto_mv_step_size = 0;
sf->mv.fullpel_search_step_param = 6;
sf->comp_inter_joint_search_thresh = BLOCK_4X4;
sf->adaptive_rd_thresh = 0;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
sf->tx_size_search_method = USE_FULL_RD;
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
@@ -364,11 +443,11 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->less_rectangular_check = 0;
sf->use_square_partition_only = 0;
sf->auto_min_max_partition_size = NOT_IN_USE;
- sf->max_partition_size = BLOCK_64X64;
- sf->min_partition_size = BLOCK_4X4;
+ sf->rd_auto_partition_min_limit = BLOCK_4X4;
+ sf->default_max_partition_size = BLOCK_64X64;
+ sf->default_min_partition_size = BLOCK_4X4;
sf->adjust_partitioning_from_last_frame = 0;
sf->last_partitioning_redo_frequency = 4;
- sf->constrain_copy_partition = 0;
sf->disable_split_mask = 0;
sf->mode_search_skip_flags = 0;
sf->force_frame_boost = 0;
@@ -400,8 +479,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->always_this_block_size = BLOCK_16X16;
sf->search_type_check_frequency = 50;
sf->encode_breakout_thresh = 0;
- sf->elevate_newmv_thresh = 0;
- // Recode loop tolerence %.
+ // Recode loop tolerance %.
sf->recode_tolerance = 25;
sf->default_interp_filter = SWITCHABLE;
sf->tx_size_search_breakout = 0;
@@ -416,8 +494,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
cpi->full_search_sad = vp9_full_search_sad;
cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search
: vp9_diamond_search_sad;
- cpi->refining_search_sad = vp9_refining_search_sad;
-
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
@@ -440,16 +516,12 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore;
}
- cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
+ x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
- if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
- sf->adaptive_pred_interp_filter = 0;
+ x->min_partition_size = sf->default_min_partition_size;
+ x->max_partition_size = sf->default_max_partition_size;
if (!cpi->oxcf.frame_periodic_boost) {
sf->max_delta_qindex = 0;
}
-
- if (cpi->encode_breakout && oxcf->mode == REALTIME &&
- sf->encode_breakout_thresh > cpi->encode_breakout)
- cpi->encode_breakout = sf->encode_breakout_thresh;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
index 951b4af2276..8575638d9a7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
@@ -93,12 +93,6 @@ typedef enum {
} MOTION_THRESHOLD;
typedef enum {
- LAST_FRAME_PARTITION_OFF = 0,
- LAST_FRAME_PARTITION_LOW_MOTION = 1,
- LAST_FRAME_PARTITION_ALL = 2
-} LAST_FRAME_PARTITION_METHOD;
-
-typedef enum {
USE_FULL_RD = 0,
USE_LARGESTALL,
USE_TX_8X8
@@ -107,8 +101,7 @@ typedef enum {
typedef enum {
NOT_IN_USE = 0,
RELAXED_NEIGHBORING_MIN_MAX = 1,
- CONSTRAIN_NEIGHBORING_MIN_MAX = 2,
- STRICT_NEIGHBORING_MIN_MAX = 3
+ STRICT_NEIGHBORING_MIN_MAX = 2
} AUTO_MIN_MAX_MODE;
typedef enum {
@@ -169,12 +162,9 @@ typedef enum {
// before the final run.
TWO_LOOP = 0,
- // No dry run conducted.
- ONE_LOOP = 1,
-
// No dry run, also only half the coef contexts and bands are updated.
// The rest are not updated at all.
- ONE_LOOP_REDUCED = 2
+ ONE_LOOP_REDUCED = 1
} FAST_COEFF_UPDATE;
typedef struct MV_SPEED_FEATURES {
@@ -242,14 +232,8 @@ typedef struct SPEED_FEATURES {
// level within a frame.
int allow_skip_recode;
- // This variable allows us to reuse the last frames partition choices
- // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
- // frame as a starting point in low motion scenes or always use it. If set
- // we use last partitioning_redo frequency to determine how often to redo
- // the partitioning from scratch. Adjust_partitioning_from_last_frame
- // enables us to adjust up or down one partitioning from the last frames
- // partitioning.
- LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
+ // Coefficient probability model approximation step size
+ int coeff_prob_appx_step;
  // The threshold used to determine how slow the motion is; it is used when
// use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
@@ -264,8 +248,6 @@ typedef struct SPEED_FEATURES {
// precise but significantly faster than the non lp version.
int use_lp32x32fdct;
- // TODO(JBB): remove this as its no longer used.
-
// After looking at the first set of modes (set by index here), skip
// checking modes for reference frames that don't match the reference frame
// of the best so far.
@@ -289,11 +271,14 @@ typedef struct SPEED_FEATURES {
// Sets min and max partition sizes for this 64x64 region based on the
// same 64x64 in last encoded frame, and the left and above neighbor.
AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+ // Ensures the rd based auto partition search will always
+ // go down at least to the specified level.
+ BLOCK_SIZE rd_auto_partition_min_limit;
// Min and max partition size we enable (block_size) as per auto
// min max, but also used by adjust partitioning, and pick_partitioning.
- BLOCK_SIZE min_partition_size;
- BLOCK_SIZE max_partition_size;
+ BLOCK_SIZE default_min_partition_size;
+ BLOCK_SIZE default_max_partition_size;
// Whether or not we allow partitions one smaller or one greater than the last
// frame's partitioning. Only used if use_lastframe_partitioning is set.
@@ -303,12 +288,6 @@ typedef struct SPEED_FEATURES {
// use_lastframe_partitioning is set.
int last_partitioning_redo_frequency;
- // This enables constrained copy partitioning, which, given an input block
- // size bsize, will copy previous partition for partitions less than bsize,
- // otherwise bsize partition is used. bsize is currently set to 16x16.
- // Used for the case where motion is detected in superblock.
- int constrain_copy_partition;
-
// Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
// it always, to allow it for only Last frame and Intra, disable it for all
// inter modes or to enable it always.
@@ -342,10 +321,6 @@ typedef struct SPEED_FEATURES {
// Fast quantization process path
int use_quant_fp;
- // Search through variable block partition types in non-RD mode decision
- // encoding process for RTC.
- int partition_check;
-
// Use finer quantizer in every other few frames that run variable block
// partition type search.
int force_frame_boost;
@@ -367,6 +342,10 @@ typedef struct SPEED_FEATURES {
int intra_y_mode_mask[TX_SIZES];
int intra_uv_mode_mask[TX_SIZES];
+ // These bit masks allow you to enable or disable intra modes for each
+ // prediction block size separately.
+ int intra_y_mode_bsize_mask[BLOCK_SIZES];
+
// This variable enables an early break out of mode testing if the model for
// rd built from the prediction signal indicates a value that's much
// higher than the best rd we've seen so far.
@@ -417,9 +396,6 @@ typedef struct SPEED_FEATURES {
// enabled in real time mode.
int encode_breakout_thresh;
- // In real time encoding, increase the threshold for NEWMV.
- int elevate_newmv_thresh;
-
// default interp filter choice
INTERP_FILTER default_interp_filter;
@@ -443,11 +419,11 @@ typedef struct SPEED_FEATURES {
struct VP9_COMP;
-void vp9_set_speed_features(struct VP9_COMP *cpi);
+void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi);
+void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_ENCODER_VP9_SPEED_FEATURES_H_
-
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c
index 5dbfbf53bbc..88db5dda06d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
#include "./vp9_rtcd.h"
-
#include "vp9/encoder/vp9_ssim.h"
void vp9_ssim_parms_16x16_c(uint8_t *s, int sp, uint8_t *r,
@@ -201,6 +201,251 @@ double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
return ssim_all;
}
+// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
+//
+// Reworking the math:
+//
+// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
+// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
+//
+// mean(x) = sum(x) / n
+//
+// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replacing c1 with n*n*c1 (and likewise c2) is the final step that leads
+// to this code. The constants are kept scaled up by 12 bits so we don't
+// lose precision in them.
+
+double ssimv_similarity(Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+ (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+ // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2)
+ / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + n * sv->sum_sq_r
+ - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
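+
+// Illustrative cross-check (guarded out; not part of the change): the
+// factored form above recomputed directly in doubles. This assumes cc1/cc2
+// are the q12-scaled SSIM constants defined earlier in this file, so
+// dividing by 4096.0 recovers n*n*c1 and n*n*c2.
+#if 0
+static double ssimv_similarity_ref(const Ssimv *sv, int64_t n) {
+  const double c1 = cc1 * n * n / 4096.0;  // n*n*c1
+  const double c2 = cc2 * n * n / 4096.0;  // n*n*c2
+  const double ss = (double)sv->sum_s, sr = (double)sv->sum_r;
+  // Same factoring as the derivation above; agrees with ssimv_similarity()
+  // up to the integer rounding of c1 and c2.
+  return ((2 * ss * sr + c1) *
+          (2 * (n * sv->sum_sxr - ss * sr) + c2)) /
+         ((ss * ss + sr * sr + c1) *
+          (n * sv->sum_sq_s - ss * ss + n * sv->sum_sq_r - sr * sr + c2));
+}
+#endif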
+
+// The first term of the ssim metric is a luminance factor:
+//
+//   (2*mean(x)*mean(y) + c1) / (mean(x)^2 + mean(y)^2 + c1)
+//
+// This luminance factor is highly sensitive on the dark side of the
+// luminance range and almost completely insensitive on the white side.
+// Compare the two pairs (1,3) and (250,252): the term gives
+// 2*1*3/(1+9) = .60 for the first but 2*250*252/(250^2+252^2) => .99999997
+// for the second.
+//
+// As a result, this tweaked version of the calculation instead takes the
+// luminance as a percentage off from the peak possible value:
+//
+//   255 * 255 - ((sum_s - sum_r) / count) * ((sum_s - sum_r) / count)
+//
+double ssimv_similarity2(Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+ const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2)
+ / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch,
+ Ssimv *sv) {
+ vp9_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch,
+ &sv->sum_s, &sv->sum_r, &sv->sum_sq_s, &sv->sum_sq_r,
+ &sv->sum_sxr);
+}
+
+double vp9_get_ssim_metrics(uint8_t *img1, int img1_pitch,
+ uint8_t *img2, int img2_pitch,
+ int width, int height,
+ Ssimv *sv2, Metrics *m,
+ int do_inconsistency) {
+ double dssim_total = 0;
+ double ssim_total = 0;
+ double ssim2_total = 0;
+ double inconsistency_total = 0;
+ int i, j;
+ int c = 0;
+ double norm;
+ double old_ssim_total = 0;
+ vp9_clear_system_state();
+  // We can sample points as frequently as we like; start with 1 per 4x4.
+ for (i = 0; i < height; i += 4,
+ img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4, ++c) {
+ Ssimv sv = {0};
+ double ssim;
+ double ssim2;
+ double dssim;
+ uint32_t var_new;
+ uint32_t var_old;
+ uint32_t mean_new;
+ uint32_t mean_old;
+ double ssim_new;
+ double ssim_old;
+
+      // Not sure there's a great way to handle the edge pixels
+      // in ssim when using a window. It seems biased against edge pixels
+      // however you handle them. This uses only samples that are
+      // fully in the frame.
+ if (j + 8 <= width && i + 8 <= height) {
+ ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+ }
+
+ ssim = ssimv_similarity(&sv, 64);
+ ssim2 = ssimv_similarity2(&sv, 64);
+
+ sv.ssim = ssim2;
+
+      // dssim is calculated for use as an actual error metric and
+      // is scaled up to the same range as sum square error.
+      // Since we are subsampling every 16th point, maybe this should be
+      // *16 ?
+ dssim = 255 * 255 * (1 - ssim2) / 2;
+
+ // Here I introduce a new error metric: consistency-weighted
+ // SSIM-inconsistency. This metric isolates frames where the
+ // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+ // sharper or blurrier than the others. Higher values indicate a
+ // temporally inconsistent SSIM. There are two ideas at work:
+ //
+ // 1) 'SSIM-inconsistency': the total inconsistency value
+ // reflects how much SSIM values are changing between this
+ // source / reference frame pair and the previous pair.
+ //
+ // 2) 'consistency-weighted': weights de-emphasize areas in the
+ // frame where the scene content has changed. Changes in scene
+ // content are detected via changes in local variance and local
+ // mean.
+ //
+ // Thus the overall measure reflects how inconsistent the SSIM
+ // values are, over consistent regions of the frame.
+ //
+ // The metric has three terms:
+ //
+ // term 1 -> uses change in scene Variance to weight error score
+ // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term 2 -> uses change in local scene luminance to weight error
+ // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+      // term 3 -> measures inconsistency in ssim scores between frames
+      //           1 - (2 * ssim(Fi)*ssim(Fi-1) / (ssim(Fi)^2 + ssim(Fi-1)^2)).
+ //
+ // This term compares the ssim score for the same location in 2
+ // subsequent frames.
+ var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+ var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+ mean_new = sv.sum_s;
+ mean_old = sv2[c].sum_s;
+ ssim_new = sv.ssim;
+ ssim_old = sv2[c].ssim;
+
+ if (do_inconsistency) {
+ // We do the metric once for every 4x4 block in the image. Since
+ // we are scaling the error to SSE for use in a psnr calculation
+ // 1.0 = 4x4x255x255 the worst error we can possibly have.
+ static const double kScaling = 4. * 4 * 255 * 255;
+
+        // The constants have to be non-zero to avoid potential divide-by-0
+        // issues; other than that they act as a kind of weighting between
+        // the terms. No testing of what the right values should be has
+        // been done.
+ static const double c1 = 1, c2 = 1, c3 = 1;
+
+ // This measures how much consistent variance is in two consecutive
+ // source frames. 1.0 means they have exactly the same variance.
+ const double variance_term = (2.0 * var_old * var_new + c1) /
+ (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+        // This measures how consistent the local means are between two
+        // consecutive frames. 1.0 means they have exactly the same mean.
+ const double mean_term = (2.0 * mean_old * mean_new + c2) /
+ (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+        // This measures how consistent the ssims of two
+        // consecutive frames are. 1.0 means they are exactly the same.
+ double ssim_term = pow((2.0 * ssim_old * ssim_new + c3) /
+ (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+ 5);
+
+ double this_inconsistency;
+
+ // Floating point math sometimes makes this > 1 by a tiny bit.
+ // We want the metric to scale between 0 and 1.0 so we can convert
+ // it to an snr scaled value.
+ if (ssim_term > 1)
+ ssim_term = 1;
+
+        // This converts the consistency metric to an inconsistency metric
+        // (so we can scale it like psnr to something like sum square error).
+        // The reason for the variance and mean terms is the assumption that
+        // if there are big changes in the source we should penalize
+        // inconsistency in ssim scores less, as it will be less visible
+        // to the user.
+ this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+ this_inconsistency *= kScaling;
+ inconsistency_total += this_inconsistency;
+ }
+ sv2[c] = sv;
+ ssim_total += ssim;
+ ssim2_total += ssim2;
+ dssim_total += dssim;
+
+ old_ssim_total += ssim_old;
+ }
+ old_ssim_total += 0;
+ }
+
+ norm = 1. / (width / 4) / (height / 4);
+ ssim_total *= norm;
+ ssim2_total *= norm;
+ m->ssim2 = ssim2_total;
+ m->ssim = ssim_total;
+ if (old_ssim_total == 0)
+ inconsistency_total = 0;
+
+ m->ssimc = inconsistency_total;
+
+ m->dssim = dssim_total;
+ return inconsistency_total;
+}
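+
+// Illustrative usage sketch (guarded out; not part of the change). The sv2
+// array carries the previous frame's per-4x4 stats for the inconsistency
+// term, so it must persist across calls and hold one entry per 4x4 sample
+// point. The dimensions below are hypothetical.
+#if 0
+static void ssim_metrics_demo(uint8_t *src, uint8_t *rec, int stride) {
+  enum { kWidth = 640, kHeight = 480 };
+  static Ssimv sv2[(kWidth / 4) * (kHeight / 4)];  // zero-initialized
+  Metrics m;
+  // Returns the accumulated inconsistency score; per-frame ssim, ssim2 and
+  // dssim come back in m.
+  vp9_get_ssim_metrics(src, stride, rec, stride, kWidth, kHeight,
+                       sv2, &m, 1 /* do_inconsistency */);
+}
+#endif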
+
+
#if CONFIG_VP9_HIGHBITDEPTH
double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h
index e75623b2545..10f14c4d268 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h
@@ -17,12 +17,64 @@ extern "C" {
#include "vpx_scale/yv12config.h"
+// metrics used for calculating ssim, ssim2, dssim, and ssimc
+typedef struct {
+  // source sum (over 8x8 region)
+  uint64_t sum_s;
+
+  // reference sum (over 8x8 region)
+  uint64_t sum_r;
+
+  // source sum squared (over 8x8 region)
+  uint64_t sum_sq_s;
+
+  // reference sum squared (over 8x8 region)
+  uint64_t sum_sq_r;
+
+  // sum of source times reference (over 8x8 region)
+ uint64_t sum_sxr;
+
+ // calculated ssim score between source and reference
+ double ssim;
+} Ssimv;
+
+// metrics collected on a frame basis
+typedef struct {
+  // ssim consistency error metric (see code for explanation)
+ double ssimc;
+
+ // standard ssim
+ double ssim;
+
+  // revised ssim (see code for explanation)
+ double ssim2;
+
+ // ssim restated as an error metric like sse
+ double dssim;
+
+ // dssim converted to decibels
+ double dssimd;
+
+ // ssimc converted to decibels
+ double ssimcd;
+} Metrics;
+
+double vp9_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency);
+
double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
double *weight);
double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
double *ssim_y, double *ssim_u, double *ssim_v);
+double vp9_calc_fastssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+ double *ssim_y, double *ssim_u, double *ssim_v);
+
+double vp9_psnrhvs(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+ double *ssim_y, double *ssim_u, double *ssim_v);
+
#if CONFIG_VP9_HIGHBITDEPTH
double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c
index 530b5923bed..cfdc90d15fb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c
@@ -140,12 +140,13 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
const vp9_prob *oldp,
vp9_prob *bestp,
- vp9_prob upd) {
+ vp9_prob upd,
+ int stepsize) {
int i, old_b, new_b, update_b, savings, bestsavings, step;
int newp;
vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
vp9_model_to_full_probs(oldp, oldplist);
- vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+ memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
old_b += cost_branch256(ct + 2 * i, oldplist[i]);
old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
@@ -153,24 +154,44 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
bestsavings = 0;
bestnewp = oldp[PIVOT_NODE];
- step = (*bestp > oldp[PIVOT_NODE] ? -1 : 1);
-
- for (newp = *bestp; newp != oldp[PIVOT_NODE]; newp += step) {
- if (newp < 1 || newp > 255)
- continue;
- newplist[PIVOT_NODE] = newp;
- vp9_model_to_full_probs(newplist, newplist);
- for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
- new_b += cost_branch256(ct + 2 * i, newplist[i]);
- new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
- update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
- vp9_cost_upd256;
- savings = old_b - new_b - update_b;
- if (savings > bestsavings) {
- bestsavings = savings;
- bestnewp = newp;
+ if (*bestp > oldp[PIVOT_NODE]) {
+ step = -stepsize;
+ for (newp = *bestp; newp > oldp[PIVOT_NODE]; newp += step) {
+ if (newp < 1 || newp > 255)
+ continue;
+ newplist[PIVOT_NODE] = newp;
+ vp9_model_to_full_probs(newplist, newplist);
+ for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+ new_b += cost_branch256(ct + 2 * i, newplist[i]);
+ new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+ update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+ vp9_cost_upd256;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ } else {
+ step = stepsize;
+ for (newp = *bestp; newp < oldp[PIVOT_NODE]; newp += step) {
+ if (newp < 1 || newp > 255)
+ continue;
+ newplist[PIVOT_NODE] = newp;
+ vp9_model_to_full_probs(newplist, newplist);
+ for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+ new_b += cost_branch256(ct + 2 * i, newplist[i]);
+ new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+ update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+ vp9_cost_upd256;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
}
}
+
*bestp = bestnewp;
return bestsavings;
}
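+
+// Illustrative note (not part of the change): with stepsize > 1 the search
+// visits only every stepsize-th probability between *bestp and the old
+// pivot, trading a slightly coarser optimum for fewer cost evaluations.
+// For example, *bestp = 40, oldp[PIVOT_NODE] = 128 and stepsize = 4 walks
+// newp = 40, 44, ..., 124 (22 evaluations) instead of all 88 values.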
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h
index 8e02a1d0d5e..ac54893cf45 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h
@@ -30,7 +30,8 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
const vp9_prob *oldp,
vp9_prob *bestp,
- vp9_prob upd);
+ vp9_prob upd,
+ int stepsize);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
index 1573557d471..b3491a27a4a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -39,13 +39,15 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cpi->common.use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS,
+ cpi->common.byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate empty frame for multiple frame "
"contexts");
- vpx_memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80,
- cpi->svc.empty_frame.img.buffer_alloc_sz);
+ memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80,
+ cpi->svc.empty_frame.img.buffer_alloc_sz);
cpi->svc.empty_frame_width = cpi->common.width;
cpi->svc.empty_frame_height = cpi->common.height;
}
@@ -77,6 +79,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+ lrc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
} else {
lc->target_bandwidth = oxcf->ss_target_bitrate[layer];
lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
@@ -85,11 +88,11 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
oxcf->best_allowed_q) / 2;
lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
oxcf->best_allowed_q) / 2;
- if (oxcf->ss_play_alternate[layer])
+ if (oxcf->ss_enable_auto_arf[layer])
lc->alt_ref_idx = alt_ref_idx++;
else
- lc->alt_ref_idx = -1;
- lc->gold_ref_idx = -1;
+ lc->alt_ref_idx = INVALID_IDX;
+ lc->gold_ref_idx = INVALID_IDX;
}
lrc->buffer_level = oxcf->starting_buffer_level_ms *
@@ -192,7 +195,7 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
oxcf->two_pass_vbrmin_section / 100);
lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
oxcf->two_pass_vbrmax_section) / 100);
- vp9_rc_set_gf_max_interval(cpi, lrc);
+ vp9_rc_set_gf_interval_range(cpi, lrc);
}
void vp9_restore_layer_context(VP9_COMP *const cpi) {
@@ -276,6 +279,7 @@ static void get_layer_resolution(const int width_org, const int height_org,
int vp9_svc_start_frame(VP9_COMP *const cpi) {
int width = 0, height = 0;
LAYER_CONTEXT *lc;
+ struct lookahead_entry *buf;
int count = 1 << (cpi->svc.number_temporal_layers - 1);
cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
@@ -305,7 +309,7 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
}
} else {
- if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]) {
+ if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]) {
cpi->alt_fb_idx = lc->alt_ref_idx;
if (!lc->has_alt_frame)
cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
@@ -317,7 +321,7 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
LAYER_CONTEXT *lc_lower =
&cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1];
- if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id - 1] &&
+ if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id - 1] &&
lc_lower->alt_ref_source != NULL)
cpi->alt_fb_idx = lc_lower->alt_ref_idx;
else if (cpi->svc.spatial_layer_id >= 2)
@@ -336,8 +340,12 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
// since its previous frame could be changed during decoding time. The idea is
// we put an empty invisible frame in front of them, then we will not use
// prev_mi when encoding these frames.
+
+ buf = vp9_lookahead_peek(cpi->lookahead, 0);
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
- cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE) {
+ cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE &&
+ lc->rc.frames_to_key != 0 &&
+ !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) {
if ((cpi->svc.number_temporal_layers > 1 &&
cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) ||
(cpi->svc.number_spatial_layers > 1 &&
@@ -372,13 +380,14 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
}
}
- if (vp9_set_size_literal(cpi, width, height) != 0)
- return VPX_CODEC_INVALID_PARAM;
-
cpi->oxcf.worst_allowed_q = vp9_quantizer_to_qindex(lc->max_q);
cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q);
vp9_change_config(cpi, &cpi->oxcf);
+
+ if (vp9_set_size_literal(cpi, width, height) != 0)
+ return VPX_CODEC_INVALID_PARAM;
+
vp9_set_high_precision_mv(cpi, 1);
cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
index 5599227ce77..d7979ab53a5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -44,7 +44,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
const int which_mv = 0;
const MV mv = { mv_row, mv_col };
const InterpKernel *const kernel =
- vp9_get_interp_kernel(xd->mi[0].src_mi->mbmi.interp_filter);
+ vp9_get_interp_kernel(xd->mi[0]->mbmi.interp_filter);
enum mv_precision mv_precision_uv;
int uv_stride;
@@ -213,7 +213,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
uint8_t *arf_frame_buf,
uint8_t *frame_ptr_buf,
int stride) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
int step_param;
@@ -225,7 +225,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
MV best_ref_mv1 = {0, 0};
MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
- MV *ref_mv = &x->e_mbd.mi[0].src_mi->bmi[0].as_mv[0].as_mv;
+ MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv;
// Save input state
struct buf_2d src = x->plane[0].src;
@@ -280,17 +280,17 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
int mb_y_offset = 0;
int mb_uv_offset = 0;
- DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3);
- MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
+ MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
uint8_t *dst1, *dst2;
#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED_ARRAY(16, uint16_t, predictor16, 16 * 16 * 3);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor8, 16 * 16 * 3);
+ DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]);
uint8_t *predictor;
#else
- DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
+ DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]);
#endif
const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
@@ -321,19 +321,19 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
// 8 - VP9_INTERP_EXTEND.
// To keep the mv in play for both Y and UV planes the max that it
// can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1).
- cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
- cpi->mb.mv_row_max = ((mb_rows - 1 - mb_row) * 16)
+ cpi->td.mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+ cpi->td.mb.mv_row_max = ((mb_rows - 1 - mb_row) * 16)
+ (17 - 2 * VP9_INTERP_EXTEND);
for (mb_col = 0; mb_col < mb_cols; mb_col++) {
int i, j, k;
int stride;
- vpx_memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
- vpx_memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+ memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
+ memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
- cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
- cpi->mb.mv_col_max = ((mb_cols - 1 - mb_col) * 16)
+ cpi->td.mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+ cpi->td.mb.mv_col_max = ((mb_cols - 1 - mb_col) * 16)
+ (17 - 2 * VP9_INTERP_EXTEND);
for (frame = 0; frame < frame_count; frame++) {
@@ -343,8 +343,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
if (frames[frame] == NULL)
continue;
- mbd->mi[0].src_mi->bmi[0].as_mv[0].as_mv.row = 0;
- mbd->mi[0].src_mi->bmi[0].as_mv[0].as_mv.col = 0;
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0;
if (frame == alt_ref_index) {
filter_weight = 2;
@@ -370,8 +370,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
frames[frame]->v_buffer + mb_uv_offset,
frames[frame]->y_stride,
mb_uv_width, mb_uv_height,
- mbd->mi[0].src_mi->bmi[0].as_mv[0].as_mv.row,
- mbd->mi[0].src_mi->bmi[0].as_mv[0].as_mv.col,
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
predictor, scale,
mb_col * 16, mb_row * 16);
@@ -653,6 +653,7 @@ static void adjust_arnr_filter(VP9_COMP *cpi,
void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
int frame;
int frames_to_blur;
int start_frame;
@@ -709,8 +710,9 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL,
- NULL)) {
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment,
+ NULL, NULL, NULL)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
}
@@ -720,8 +722,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
}
}
cm->mi = cm->mip + cm->mi_stride + 1;
- cpi->mb.e_mbd.mi = cm->mi;
- cpi->mb.e_mbd.mi[0].src_mi = &cpi->mb.e_mbd.mi[0];
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
} else {
// ARF is produced at the native frame size and resized when coded.
#if CONFIG_VP9_HIGHBITDEPTH
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c
index adf01bf35ba..862be4d384a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c
@@ -23,22 +23,32 @@
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_tokenize.h"
-static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
-const TOKENVALUE *vp9_dct_value_tokens_ptr;
-static int16_t dct_value_cost[DCT_MAX_VALUE * 2];
-const int16_t *vp9_dct_value_cost_ptr;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static TOKENVALUE dct_value_tokens_high10[DCT_MAX_VALUE_HIGH10 * 2];
-const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
-static int16_t dct_value_cost_high10[DCT_MAX_VALUE_HIGH10 * 2];
-const int16_t *vp9_dct_value_cost_high10_ptr;
-
-static TOKENVALUE dct_value_tokens_high12[DCT_MAX_VALUE_HIGH12 * 2];
-const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
-static int16_t dct_value_cost_high12[DCT_MAX_VALUE_HIGH12 * 2];
-const int16_t *vp9_dct_value_cost_high12_ptr;
-#endif
+static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
+ {9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49},
+ {9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
+ {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21}, {9, 19}, {9, 17},
+ {9, 15}, {9, 13}, {9, 11}, {9, 9}, {9, 7}, {9, 5}, {9, 3}, {9, 1},
+ {8, 31}, {8, 29}, {8, 27}, {8, 25}, {8, 23}, {8, 21},
+ {8, 19}, {8, 17}, {8, 15}, {8, 13}, {8, 11}, {8, 9},
+ {8, 7}, {8, 5}, {8, 3}, {8, 1},
+ {7, 15}, {7, 13}, {7, 11}, {7, 9}, {7, 7}, {7, 5}, {7, 3}, {7, 1},
+ {6, 7}, {6, 5}, {6, 3}, {6, 1}, {5, 3}, {5, 1},
+ {4, 1}, {3, 1}, {2, 1}, {1, 1}, {0, 0},
+ {1, 0}, {2, 0}, {3, 0}, {4, 0},
+ {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4}, {6, 6},
+ {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8}, {7, 10}, {7, 12}, {7, 14},
+ {8, 0}, {8, 2}, {8, 4}, {8, 6}, {8, 8}, {8, 10}, {8, 12},
+ {8, 14}, {8, 16}, {8, 18}, {8, 20}, {8, 22}, {8, 24},
+ {8, 26}, {8, 28}, {8, 30}, {9, 0}, {9, 2},
+ {9, 4}, {9, 6}, {9, 8}, {9, 10}, {9, 12}, {9, 14}, {9, 16},
+ {9, 18}, {9, 20}, {9, 22}, {9, 24}, {9, 26}, {9, 28},
+ {9, 30}, {9, 32}, {9, 34}, {9, 36}, {9, 38}, {9, 40},
+ {9, 42}, {9, 44}, {9, 46}, {9, 48}, {9, 50}, {9, 52},
+ {9, 54}, {9, 56}, {9, 58}, {9, 60}, {9, 62}
+};
+const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
+ (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
+ / 2;
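+
+// Illustrative note (guarded out; not part of the change): offsetting the
+// pointer to the table midpoint lets a small signed coefficient value index
+// it directly; negative values mirror the positive side, with the sign bit
+// folded into the extra field.
+#if 0
+static void token_lookup_demo(void) {
+  const TOKENVALUE pos = vp9_dct_cat_lt_10_value_tokens[6];   // {5, 2}
+  const TOKENVALUE neg = vp9_dct_cat_lt_10_value_tokens[-6];  // {5, 3}
+  // Both are CATEGORY1_TOKEN (token 5, base value 5);
+  // extra == ((|v| - 5) << 1) | sign.
+  (void)pos;
+  (void)neg;
+}
+#endif
+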
// Array indices are identical to previously-existing CONTEXT_NODE indices
const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
@@ -55,204 +65,390 @@ const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
-CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE
};
-// Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
- 2, 6, // 0 = LOW_VAL
- -TWO_TOKEN, 4, // 1 = TWO
- -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
- 8, 10, // 3 = HIGH_LOW
- -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
- 12, 14, // 5 = CAT_THREEFOUR
- -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
- -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
+static const vp9_tree_index cat1[2] = {0, 0};
+static const vp9_tree_index cat2[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+ 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
+
+static const int16_t zero_cost[] = {0};
+static const int16_t one_cost[] = {255, 257};
+static const int16_t two_cost[] = {255, 257};
+static const int16_t three_cost[] = {255, 257};
+static const int16_t four_cost[] = {255, 257};
+static const int16_t cat1_cost[] = {429, 431, 616, 618};
+static const int16_t cat2_cost[] = {624, 626, 727, 729, 848, 850, 951, 953};
+static const int16_t cat3_cost[] = {
+ 820, 822, 893, 895, 940, 942, 1013, 1015, 1096, 1098, 1169, 1171, 1216, 1218,
+ 1289, 1291
+};
+static const int16_t cat4_cost[] = {
+ 1032, 1034, 1075, 1077, 1105, 1107, 1148, 1150, 1194, 1196, 1237, 1239,
+ 1267, 1269, 1310, 1312, 1328, 1330, 1371, 1373, 1401, 1403, 1444, 1446,
+ 1490, 1492, 1533, 1535, 1563, 1565, 1606, 1608
+};
+static const int16_t cat5_cost[] = {
+ 1269, 1271, 1283, 1285, 1306, 1308, 1320,
+ 1322, 1347, 1349, 1361, 1363, 1384, 1386, 1398, 1400, 1443, 1445, 1457,
+ 1459, 1480, 1482, 1494, 1496, 1521, 1523, 1535, 1537, 1558, 1560, 1572,
+ 1574, 1592, 1594, 1606, 1608, 1629, 1631, 1643, 1645, 1670, 1672, 1684,
+ 1686, 1707, 1709, 1721, 1723, 1766, 1768, 1780, 1782, 1803, 1805, 1817,
+ 1819, 1844, 1846, 1858, 1860, 1881, 1883, 1895, 1897
+};
+const int16_t vp9_cat6_low_cost[256] = {
+ 1638, 1640, 1646, 1648, 1652, 1654, 1660, 1662,
+ 1670, 1672, 1678, 1680, 1684, 1686, 1692, 1694, 1711, 1713, 1719, 1721,
+ 1725, 1727, 1733, 1735, 1743, 1745, 1751, 1753, 1757, 1759, 1765, 1767,
+ 1787, 1789, 1795, 1797, 1801, 1803, 1809, 1811, 1819, 1821, 1827, 1829,
+ 1833, 1835, 1841, 1843, 1860, 1862, 1868, 1870, 1874, 1876, 1882, 1884,
+ 1892, 1894, 1900, 1902, 1906, 1908, 1914, 1916, 1940, 1942, 1948, 1950,
+ 1954, 1956, 1962, 1964, 1972, 1974, 1980, 1982, 1986, 1988, 1994, 1996,
+ 2013, 2015, 2021, 2023, 2027, 2029, 2035, 2037, 2045, 2047, 2053, 2055,
+ 2059, 2061, 2067, 2069, 2089, 2091, 2097, 2099, 2103, 2105, 2111, 2113,
+ 2121, 2123, 2129, 2131, 2135, 2137, 2143, 2145, 2162, 2164, 2170, 2172,
+ 2176, 2178, 2184, 2186, 2194, 2196, 2202, 2204, 2208, 2210, 2216, 2218,
+ 2082, 2084, 2090, 2092, 2096, 2098, 2104, 2106, 2114, 2116, 2122, 2124,
+ 2128, 2130, 2136, 2138, 2155, 2157, 2163, 2165, 2169, 2171, 2177, 2179,
+ 2187, 2189, 2195, 2197, 2201, 2203, 2209, 2211, 2231, 2233, 2239, 2241,
+ 2245, 2247, 2253, 2255, 2263, 2265, 2271, 2273, 2277, 2279, 2285, 2287,
+ 2304, 2306, 2312, 2314, 2318, 2320, 2326, 2328, 2336, 2338, 2344, 2346,
+ 2350, 2352, 2358, 2360, 2384, 2386, 2392, 2394, 2398, 2400, 2406, 2408,
+ 2416, 2418, 2424, 2426, 2430, 2432, 2438, 2440, 2457, 2459, 2465, 2467,
+ 2471, 2473, 2479, 2481, 2489, 2491, 2497, 2499, 2503, 2505, 2511, 2513,
+ 2533, 2535, 2541, 2543, 2547, 2549, 2555, 2557, 2565, 2567, 2573, 2575,
+ 2579, 2581, 2587, 2589, 2606, 2608, 2614, 2616, 2620, 2622, 2628, 2630,
+ 2638, 2640, 2646, 2648, 2652, 2654, 2660, 2662
+};
+const int16_t vp9_cat6_high_cost[128] = {
+ 72, 892, 1183, 2003, 1448, 2268, 2559, 3379,
+ 1709, 2529, 2820, 3640, 3085, 3905, 4196, 5016, 2118, 2938, 3229, 4049,
+ 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+ 2118, 2938, 3229, 4049, 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686,
+ 5131, 5951, 6242, 7062, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+ 5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 2118, 2938, 3229, 4049,
+ 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+ 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, 5801, 6621, 6912, 7732,
+ 7177, 7997, 8288, 9108, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+ 5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 6210, 7030, 7321, 8141,
+ 7586, 8406, 8697, 9517, 7847, 8667, 8958, 9778, 9223, 10043, 10334, 11154
};
-
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
#if CONFIG_VP9_HIGHBITDEPTH
-static vp9_tree_index cat1_high10[2];
-static vp9_tree_index cat2_high10[4];
-static vp9_tree_index cat3_high10[6];
-static vp9_tree_index cat4_high10[8];
-static vp9_tree_index cat5_high10[10];
-static vp9_tree_index cat6_high10[32];
-static vp9_tree_index cat1_high12[2];
-static vp9_tree_index cat2_high12[4];
-static vp9_tree_index cat3_high12[6];
-static vp9_tree_index cat4_high12[8];
-static vp9_tree_index cat5_high12[10];
-static vp9_tree_index cat6_high12[36];
+const int16_t vp9_cat6_high10_high_cost[512] = {
+ 74, 894, 1185, 2005, 1450, 2270, 2561,
+ 3381, 1711, 2531, 2822, 3642, 3087, 3907, 4198, 5018, 2120, 2940, 3231,
+ 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+ 7064, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+ 5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 2120, 2940, 3231,
+ 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+ 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914,
+ 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+ 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+ 11156, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+ 5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277,
+ 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+ 9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+ 9780, 9225, 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+ 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+ 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+ 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+ 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 2120,
+ 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133,
+ 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+ 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542,
+ 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212,
+ 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+ 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+ 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, 8143, 7588,
+ 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 6212,
+ 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+ 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, 10745, 11565,
+ 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 4166, 4986, 5277,
+ 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+ 9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+ 9780, 9225, 10045, 10336, 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699,
+ 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369,
+ 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091,
+ 12382, 13202, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669,
+ 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+ 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 8258,
+ 9078, 9369, 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826,
+ 11271, 12091, 12382, 13202, 10304, 11124, 11415, 12235, 11680, 12500, 12791,
+ 13611, 11941, 12761, 13052, 13872, 13317, 14137, 14428, 15248,
+};
+const int16_t vp9_cat6_high12_high_cost[2048] = {
+ 76, 896, 1187, 2007, 1452, 2272, 2563,
+ 3383, 1713, 2533, 2824, 3644, 3089, 3909, 4200, 5020, 2122, 2942, 3233,
+ 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+ 7066, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+ 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 2122, 2942, 3233,
+ 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+ 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+ 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+ 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 2122,
+ 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135,
+ 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+ 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544,
+ 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214,
+ 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+ 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+ 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590,
+ 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214,
+ 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+ 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+ 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+ 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+ 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 2122, 2942,
+ 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955,
+ 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+ 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364,
+ 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+ 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+ 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+ 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279, 6099,
+ 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112,
+ 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+ 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521,
+ 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191,
+ 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+ 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 4168, 4988,
+ 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001,
+ 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+ 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410,
+ 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080,
+ 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273,
+ 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+ 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+ 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+ 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+ 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+ 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 6214,
+ 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+ 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371,
+ 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+ 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+ 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191,
+ 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+ 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+ 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682,
+ 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250,
+ 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+ 15920, 15365, 16185, 16476, 17296, 2122, 2942, 3233, 4053, 3498, 4318, 4609,
+ 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+ 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+ 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+ 12384, 13204, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+ 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+ 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+ 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145,
+ 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+ 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+ 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+ 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+ 13319, 14139, 14430, 15250, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475,
+ 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145,
+ 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+ 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+ 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+ 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+ 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191,
+ 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+ 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+ 13054, 13874, 13319, 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590,
+ 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+ 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+ 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+ 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+ 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+ 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+ 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+ 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+ 17296, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+ 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+ 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590,
+ 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+ 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+ 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+ 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+ 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+ 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+ 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+ 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+ 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126,
+ 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+ 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659,
+ 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+ 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636,
+ 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+ 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054,
+ 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+ 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+ 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172,
+ 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365,
+ 16185, 16476, 17296, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+ 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+ 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+ 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+ 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+ 17296, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+ 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728,
+ 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296,
+ 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+ 15920, 15365, 16185, 16476, 17296, 14398, 15218, 15509, 16329, 15774, 16594,
+ 16885, 17705, 16035, 16855, 17146, 17966, 17411, 18231, 18522, 19342
+};
#endif
-static void init_bit_tree(vp9_tree_index *p, int n) {
- int i = 0;
-
- while (++i < n) {
- p[0] = p[1] = i << 1;
- p += 2;
- }
-
- p[0] = p[1] = 0;
-}
-
-static void init_bit_trees() {
- init_bit_tree(cat1, 1);
- init_bit_tree(cat2, 2);
- init_bit_tree(cat3, 3);
- init_bit_tree(cat4, 4);
- init_bit_tree(cat5, 5);
- init_bit_tree(cat6, 14);
#if CONFIG_VP9_HIGHBITDEPTH
- init_bit_tree(cat1_high10, 1);
- init_bit_tree(cat2_high10, 2);
- init_bit_tree(cat3_high10, 3);
- init_bit_tree(cat4_high10, 4);
- init_bit_tree(cat5_high10, 5);
- init_bit_tree(cat6_high10, 16);
- init_bit_tree(cat1_high12, 1);
- init_bit_tree(cat2_high12, 2);
- init_bit_tree(cat3_high12, 3);
- init_bit_tree(cat4_high12, 4);
- init_bit_tree(cat5_high12, 5);
- init_bit_tree(cat6_high12, 18);
+static const vp9_tree_index cat1_high10[2] = {0, 0};
+static const vp9_tree_index cat2_high10[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+ 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+ 30, 30, 0, 0};
+static const vp9_tree_index cat1_high12[2] = {0, 0};
+static const vp9_tree_index cat2_high12[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+ 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+ 30, 30, 32, 32, 34, 34, 0, 0};
#endif
-}
const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
- {0, 0, 0, 0}, // ZERO_TOKEN
- {0, 0, 0, 1}, // ONE_TOKEN
- {0, 0, 0, 2}, // TWO_TOKEN
- {0, 0, 0, 3}, // THREE_TOKEN
- {0, 0, 0, 4}, // FOUR_TOKEN
- {cat1, vp9_cat1_prob, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN
- {cat2, vp9_cat2_prob, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN
- {cat3, vp9_cat3_prob, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN
- {cat4, vp9_cat4_prob, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN
- {cat5, vp9_cat5_prob, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN
- {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL}, // CATEGORY6_TOKEN
- {0, 0, 0, 0} // EOB_TOKEN
+ {0, 0, 0, 0, zero_cost}, // ZERO_TOKEN
+ {0, 0, 0, 1, one_cost}, // ONE_TOKEN
+ {0, 0, 0, 2, two_cost}, // TWO_TOKEN
+ {0, 0, 0, 3, three_cost}, // THREE_TOKEN
+ {0, 0, 0, 4, four_cost}, // FOUR_TOKEN
+ {cat1, vp9_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost}, // CATEGORY1_TOKEN
+ {cat2, vp9_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost}, // CATEGORY2_TOKEN
+ {cat3, vp9_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost}, // CATEGORY3_TOKEN
+ {cat4, vp9_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost}, // CATEGORY4_TOKEN
+ {cat5, vp9_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost}, // CATEGORY5_TOKEN
+ {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL, 0}, // CATEGORY6_TOKEN
+ {0, 0, 0, 0, zero_cost} // EOB_TOKEN
};
#if CONFIG_VP9_HIGHBITDEPTH
const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = {
- {0, 0, 0, 0}, // ZERO_TOKEN
- {0, 0, 0, 1}, // ONE_TOKEN
- {0, 0, 0, 2}, // TWO_TOKEN
- {0, 0, 0, 3}, // THREE_TOKEN
- {0, 0, 0, 4}, // FOUR_TOKEN
- {cat1_high10, vp9_cat1_prob_high10, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN
- {cat2_high10, vp9_cat2_prob_high10, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN
- {cat3_high10, vp9_cat3_prob_high10, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN
- {cat4_high10, vp9_cat4_prob_high10, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN
- {cat5_high10, vp9_cat5_prob_high10, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN
- {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL}, // CATEGORY6_TOKEN
- {0, 0, 0, 0} // EOB_TOKEN
+ {0, 0, 0, 0, zero_cost}, // ZERO
+ {0, 0, 0, 1, one_cost}, // ONE
+ {0, 0, 0, 2, two_cost}, // TWO
+ {0, 0, 0, 3, three_cost}, // THREE
+ {0, 0, 0, 4, four_cost}, // FOUR
+ {cat1_high10, vp9_cat1_prob_high10, 1, CAT1_MIN_VAL, cat1_cost}, // CAT1
+ {cat2_high10, vp9_cat2_prob_high10, 2, CAT2_MIN_VAL, cat2_cost}, // CAT2
+ {cat3_high10, vp9_cat3_prob_high10, 3, CAT3_MIN_VAL, cat3_cost}, // CAT3
+ {cat4_high10, vp9_cat4_prob_high10, 4, CAT4_MIN_VAL, cat4_cost}, // CAT4
+ {cat5_high10, vp9_cat5_prob_high10, 5, CAT5_MIN_VAL, cat5_cost}, // CAT5
+ {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL, 0}, // CAT6
+ {0, 0, 0, 0, zero_cost} // EOB
};
const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
- {0, 0, 0, 0}, // ZERO_TOKEN
- {0, 0, 0, 1}, // ONE_TOKEN
- {0, 0, 0, 2}, // TWO_TOKEN
- {0, 0, 0, 3}, // THREE_TOKEN
- {0, 0, 0, 4}, // FOUR_TOKEN
- {cat1_high12, vp9_cat1_prob_high12, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN
- {cat2_high12, vp9_cat2_prob_high12, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN
- {cat3_high12, vp9_cat3_prob_high12, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN
- {cat4_high12, vp9_cat4_prob_high12, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN
- {cat5_high12, vp9_cat5_prob_high12, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN
- {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL}, // CATEGORY6_TOKEN
- {0, 0, 0, 0} // EOB_TOKEN
+ {0, 0, 0, 0, zero_cost}, // ZERO
+ {0, 0, 0, 1, one_cost}, // ONE
+ {0, 0, 0, 2, two_cost}, // TWO
+ {0, 0, 0, 3, three_cost}, // THREE
+ {0, 0, 0, 4, four_cost}, // FOUR
+ {cat1_high12, vp9_cat1_prob_high12, 1, CAT1_MIN_VAL, cat1_cost}, // CAT1
+ {cat2_high12, vp9_cat2_prob_high12, 2, CAT2_MIN_VAL, cat2_cost}, // CAT2
+ {cat3_high12, vp9_cat3_prob_high12, 3, CAT3_MIN_VAL, cat3_cost}, // CAT3
+ {cat4_high12, vp9_cat4_prob_high12, 4, CAT4_MIN_VAL, cat4_cost}, // CAT4
+ {cat5_high12, vp9_cat5_prob_high12, 5, CAT5_MIN_VAL, cat5_cost}, // CAT5
+ {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0}, // CAT6
+ {0, 0, 0, 0, zero_cost} // EOB
};
#endif
-struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS];
-
-void vp9_coef_tree_initialize() {
- init_bit_trees();
- vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
-
-static void tokenize_init_one(TOKENVALUE *t, const vp9_extra_bit *const e,
- int16_t *value_cost, int max_value) {
- int i = -max_value;
- int sign = 1;
-
- do {
- if (!i)
- sign = 0;
-
- {
- const int a = sign ? -i : i;
- int eb = sign;
-
- if (a > 4) {
- int j = 4;
-
- while (++j < 11 && e[j].base_val <= a) {}
-
- t[i].token = --j;
- eb |= (a - e[j].base_val) << 1;
- } else {
- t[i].token = a;
- }
- t[i].extra = eb;
- }
-
- // Initialize the cost for extra bits for all possible coefficient values.
- {
- int cost = 0;
- const vp9_extra_bit *p = &e[t[i].token];
-
- if (p->base_val) {
- const int extra = t[i].extra;
- const int length = p->len;
-
- if (length)
- cost += treed_cost(p->tree, p->prob, extra >> 1, length);
-
- cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
- value_cost[i] = cost;
- }
- }
- } while (++i < max_value);
-}
-
-void vp9_tokenize_initialize() {
- vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
- vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
+const struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS] = {
+ {2, 2}, {6, 3}, {28, 5}, {58, 6}, {59, 6}, {60, 6}, {61, 6}, {124, 7},
+ {125, 7}, {126, 7}, {127, 7}, {0, 1}
+};
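The {value, len} pairs above are the codewords a walk of vp9_coef_tree yields, previously built at runtime by vp9_tokens_from_tree() in the removed vp9_coef_tree_initialize(). A hedged sketch of such a tree walk (modeled on libvpx's generic routine; the upstream signature may differ):

struct tok { int value; int len; };
typedef signed char tree_index;  /* stand-in for vp9_tree_index */

/* Appends one bit per branch and records the accumulated path at each
 * leaf; leaves are encoded as non-positive entries, -j being the token
 * index. A sketch, not the upstream implementation. */
static void tree_to_tokens(struct tok *tokens, const tree_index *tree,
                           tree_index i, int v, int l) {
  v += v;  /* append a 0 bit */
  ++l;
  do {
    const tree_index j = tree[i++];
    if (j <= 0) {
      tokens[-j].value = v;
      tokens[-j].len = l;
    } else {
      tree_to_tokens(tokens, tree, j, v, l);
    }
  } while (++v & 1);  /* second pass with the 1 bit appended */
}
/* usage sketch: tree_to_tokens(tokens, vp9_coef_tree, 0, 0, 0); */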
- tokenize_init_one(dct_value_tokens + DCT_MAX_VALUE, vp9_extra_bits,
- dct_value_cost + DCT_MAX_VALUE, DCT_MAX_VALUE);
-#if CONFIG_VP9_HIGHBITDEPTH
- vp9_dct_value_tokens_high10_ptr = dct_value_tokens_high10 +
- DCT_MAX_VALUE_HIGH10;
- vp9_dct_value_cost_high10_ptr = dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10;
-
- tokenize_init_one(dct_value_tokens_high10 + DCT_MAX_VALUE_HIGH10,
- vp9_extra_bits_high10,
- dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10,
- DCT_MAX_VALUE_HIGH10);
- vp9_dct_value_tokens_high12_ptr = dct_value_tokens_high12 +
- DCT_MAX_VALUE_HIGH12;
- vp9_dct_value_cost_high12_ptr = dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12;
-
- tokenize_init_one(dct_value_tokens_high12 + DCT_MAX_VALUE_HIGH12,
- vp9_extra_bits_high12,
- dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12,
- DCT_MAX_VALUE_HIGH12);
-#endif
-}
struct tokenize_b_args {
VP9_COMP *cpi;
- MACROBLOCKD *xd;
+ ThreadData *td;
TOKENEXTRA **tp;
};
static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
- MACROBLOCKD *const xd = args->xd;
- struct macroblock_plane *p = &args->cpi->mb.plane[plane];
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -294,12 +490,14 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
VP9_COMP *cpi = args->cpi;
- MACROBLOCKD *xd = args->xd;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
TOKENEXTRA **tp = args->tp;
uint8_t token_cache[32 * 32];
- struct macroblock_plane *p = &cpi->mb.plane[plane];
+ struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
int pt; /* near block/prev token context index */
int c;
TOKENEXTRA *t = *tp; /* store tokens starting here */
@@ -311,15 +509,15 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
const scan_order *so;
const int ref = is_inter_block(mbmi);
unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
- cpi->coef_counts[tx_size][type][ref];
+ td->rd_counts.coef_counts[tx_size][type][ref];
vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
- cpi->common.fc.coef_probs[tx_size][type][ref];
+ cpi->common.fc->coef_probs[tx_size][type][ref];
unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
- cpi->common.counts.eob_branch[tx_size][type][ref];
+ td->counts->eob_branch[tx_size][type][ref];
const uint8_t *const band = get_band_translate(tx_size);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
- const TOKENVALUE *dct_value_tokens;
-
+ int16_t token;
+ EXTRABIT extra;
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -329,17 +527,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
scan = so->scan;
nb = so->neighbors;
c = 0;
-#if CONFIG_VP9_HIGHBITDEPTH
- if (cpi->common.profile >= PROFILE_2) {
- dct_value_tokens = (cpi->common.bit_depth == VPX_BITS_10 ?
- vp9_dct_value_tokens_high10_ptr :
- vp9_dct_value_tokens_high12_ptr);
- } else {
- dct_value_tokens = vp9_dct_value_tokens_ptr;
- }
-#else
- dct_value_tokens = vp9_dct_value_tokens_ptr;
-#endif
while (c < eob) {
int v = 0;
@@ -358,14 +545,13 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
v = qcoeff[scan[c]];
}
- add_token(&t, coef_probs[band[c]][pt],
- dct_value_tokens[v].extra,
- (uint8_t)dct_value_tokens[v].token,
- (uint8_t)skip_eob,
- counts[band[c]][pt]);
+ vp9_get_token_extra(v, &token, &extra);
+
+ add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token,
+ (uint8_t)skip_eob, counts[band[c]][pt]);
eob_branch[band[c]][pt] += !skip_eob;
- token_cache[scan[c]] = vp9_pt_energy_class[dct_value_tokens[v].token];
+ token_cache[scan[c]] = vp9_pt_energy_class[token];
++c;
pt = get_coef_context(nb, token_cache, c);
}
@@ -421,30 +607,27 @@ int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
return result;
}
-void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
- BLOCK_SIZE bsize) {
+void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ int dry_run, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
- TOKENEXTRA *t_backup = *t;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const int ctx = vp9_get_skip_context(xd);
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
- struct tokenize_b_args arg = {cpi, xd, t};
+ struct tokenize_b_args arg = {cpi, td, t};
if (mbmi->skip) {
if (!dry_run)
- cm->counts.skip[ctx][1] += skip_inc;
+ td->counts->skip[ctx][1] += skip_inc;
reset_skip_context(xd, bsize);
- if (dry_run)
- *t = t_backup;
return;
}
if (!dry_run) {
- cm->counts.skip[ctx][0] += skip_inc;
+ td->counts->skip[ctx][0] += skip_inc;
vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
} else {
vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
- *t = t_backup;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h
index 825252bac22..81cc2e13f99 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h
@@ -20,41 +20,39 @@
extern "C" {
#endif
-void vp9_tokenize_initialize();
-
#define EOSB_TOKEN 127 // Not signalled, encoder only
-typedef struct {
- int16_t token;
#if CONFIG_VP9_HIGHBITDEPTH
- int32_t extra;
+ typedef int32_t EXTRABIT;
#else
- int16_t extra;
+ typedef int16_t EXTRABIT;
#endif
+
+
+typedef struct {
+ int16_t token;
+ EXTRABIT extra;
} TOKENVALUE;
typedef struct {
const vp9_prob *context_tree;
-#if CONFIG_VP9_HIGHBITDEPTH
- int32_t extra;
-#else
- int16_t extra;
-#endif
- uint8_t token;
- uint8_t skip_eob_node;
+ EXTRABIT extra;
+ uint8_t token;
+ uint8_t skip_eob_node;
} TOKENEXTRA;
extern const vp9_tree_index vp9_coef_tree[];
extern const vp9_tree_index vp9_coef_con_tree[];
-extern struct vp9_token vp9_coef_encodings[];
+extern const struct vp9_token vp9_coef_encodings[];
int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
struct VP9_COMP;
+struct ThreadData;
-void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
- BLOCK_SIZE bsize);
+void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
extern const int16_t *vp9_dct_value_cost_ptr;
/* TODO: The Token field should be broken out into a separate char array to
@@ -62,13 +60,51 @@ extern const int16_t *vp9_dct_value_cost_ptr;
* fields are not.
*/
extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int16_t vp9_cat6_low_cost[256];
+extern const int16_t vp9_cat6_high_cost[128];
+extern const int16_t vp9_cat6_high10_high_cost[512];
+extern const int16_t vp9_cat6_high12_high_cost[2048];
+static INLINE int16_t vp9_get_cost(int16_t token, EXTRABIT extrabits,
+ const int16_t *cat6_high_table) {
+ if (token != CATEGORY6_TOKEN)
+ return vp9_extra_bits[token].cost[extrabits];
+ return vp9_cat6_low_cost[extrabits & 0xff]
+ + cat6_high_table[extrabits >> 8];
+}
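vp9_get_cost() above splits a CATEGORY6 extra-bits value between two precomputed tables: the low byte indexes the 256-entry vp9_cat6_low_cost, and the remaining bits index the bit-depth specific high table returned by vp9_get_high_cost_table(). A minimal sketch of that split (table pointers are parameters here, not the real symbols):

/* low_table has 256 entries; high_table is sized by bit depth
 * (128, 512 or 2048 entries, per the externs above). */
static int cat6_cost_split(int extrabits,
                           const short *low_table,
                           const short *high_table) {
  return low_table[extrabits & 0xff] + high_table[extrabits >> 8];
}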
+
#if CONFIG_VP9_HIGHBITDEPTH
-extern const int16_t *vp9_dct_value_cost_high10_ptr;
-extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
-extern const int16_t *vp9_dct_value_cost_high12_ptr;
-extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
+static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+ return bit_depth == 8 ? vp9_cat6_high_cost
+ : (bit_depth == 10 ? vp9_cat6_high10_high_cost :
+ vp9_cat6_high12_high_cost);
+}
+#else
+static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+ (void) bit_depth;
+ return vp9_cat6_high_cost;
+}
#endif // CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+ *token = CATEGORY6_TOKEN;
+ if (v >= CAT6_MIN_VAL)
+ *extra = 2 * v - 2 * CAT6_MIN_VAL;
+ else
+ *extra = -2 * v - 2 * CAT6_MIN_VAL + 1;
+ return;
+ }
+ *token = vp9_dct_cat_lt_10_value_tokens[v].token;
+ *extra = vp9_dct_cat_lt_10_value_tokens[v].extra;
+}
+static INLINE int16_t vp9_get_token(int v) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL)
+ return 10;
+ return vp9_dct_cat_lt_10_value_tokens[v].token;
+}
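The CATEGORY6 branch of vp9_get_token_extra() above packs 2 * (|v| - CAT6_MIN_VAL) into the extra field with the sign of v in bit 0; the adjacent even/odd entries throughout the cost tables in the vp9_tokenize.c hunk above (e.g. 1638, 1640) are the two settings of that sign bit. A minimal sketch of the packing, with min_val as a stand-in for the real CAT6_MIN_VAL macro:

/* bit 0 carries the sign (0 for positive v, 1 for negative),
 * matching both branches of vp9_get_token_extra() above */
static int pack_cat6_extra(int v, int min_val) {
  return v >= 0 ? 2 * (v - min_val) : 2 * (-v - min_val) + 1;
}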
+
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c
index 4555bde1e7e..f38f96d6c26 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c
@@ -145,7 +145,7 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
- DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, H * W); \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
BILINEAR_FILTERS_2TAP(xoffset)); \
@@ -298,8 +298,8 @@ void highbd_variance(const uint8_t *a8, int a_stride,
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = sse_long;
- *sum = sum_long;
+ *sse = (unsigned int)sse_long;
+ *sum = (int)sum_long;
}
void highbd_10_variance(const uint8_t *a8, int a_stride,
@@ -309,8 +309,8 @@ void highbd_10_variance(const uint8_t *a8, int a_stride,
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sum = ROUND_POWER_OF_TWO(sum_long, 2);
- *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
}
void highbd_12_variance(const uint8_t *a8, int a_stride,
@@ -320,8 +320,8 @@ void highbd_12_variance(const uint8_t *a8, int a_stride,
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sum = ROUND_POWER_OF_TWO(sum_long, 4);
- *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
}
static void highbd_var_filter_block2d_bil_first_pass(
@@ -464,7 +464,7 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, BILINEAR_FILTERS_2TAP(xoffset)); \
@@ -486,7 +486,7 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, BILINEAR_FILTERS_2TAP(xoffset)); \
@@ -508,7 +508,7 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, BILINEAR_FILTERS_2TAP(xoffset)); \
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index ca6cf1ac916..4672aa6b8cf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -11,6 +11,83 @@
#include <emmintrin.h>
#include "vpx_ports/mem.h"
+void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+ u0 = _mm_setzero_si128();
+ // Row 0
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff0 = _mm_max_epi16(diff, negdiff);
+ // Row 1
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+ minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+ // Row 2
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 3
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 4
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 5
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 6
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 7
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+ *max = _mm_extract_epi16(maxabsdiff, 0);
+
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+ *min = _mm_extract_epi16(minabsdiff, 0);
+}
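A scalar sketch of what the SSE2 kernel above computes — the minimum and maximum absolute source/prediction difference over an 8x8 block — usable as a cross-check harness (a sketch, not the upstream C fallback):

#include <stdlib.h>  /* abs() */

static void minmax_8x8_c_sketch(const unsigned char *s, int p,
                                const unsigned char *d, int dp,
                                int *min, int *max) {
  int r, c;
  *min = 255;
  *max = 0;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int diff = abs(s[r * p + c] - d[r * dp + c]);
      if (diff < *min) *min = diff;
      if (diff > *max) *max = diff;
    }
  }
}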
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
__m128i s0, s1, u0;
@@ -38,3 +115,307 @@ unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
avg = _mm_extract_epi16(s0, 0);
return (avg + 32) >> 6;
}
+
+unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+ avg = _mm_extract_epi16(s0, 0);
+ return (avg + 8) >> 4;
+}
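The 4x4 average reduces to a 16-pixel sum with round-to-nearest on the divide by 16; a scalar sketch of the kernel above:

static unsigned int avg_4x4_c_sketch(const unsigned char *s, int p) {
  int r, c, sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += s[r * p + c];
  return (sum + 8) >> 4;  /* matches the (avg + 8) >> 4 rounding above */
}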
+
+static void hadamard_col8_sse2(__m128i *in, int iter) {
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+
+ __m128i b0 = _mm_add_epi16(a0, a1);
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+
+ a0 = _mm_add_epi16(b0, b2);
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4);
+ b7 = _mm_add_epi16(a1, a5);
+ b3 = _mm_add_epi16(a2, a6);
+ b4 = _mm_add_epi16(a3, a7);
+ b2 = _mm_sub_epi16(a0, a4);
+ b6 = _mm_sub_epi16(a1, a5);
+ b1 = _mm_sub_epi16(a2, a6);
+ b5 = _mm_sub_epi16(a3, a7);
+
+ a0 = _mm_unpacklo_epi16(b0, b1);
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+
+ b0 = _mm_unpacklo_epi32(a0, a1);
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm_unpacklo_epi64(b0, b1);
+ in[1] = _mm_unpackhi_epi64(b0, b1);
+ in[2] = _mm_unpacklo_epi64(b2, b3);
+ in[3] = _mm_unpackhi_epi64(b2, b3);
+ in[4] = _mm_unpacklo_epi64(b4, b5);
+ in[5] = _mm_unpackhi_epi64(b4, b5);
+ in[6] = _mm_unpacklo_epi64(b6, b7);
+ in[7] = _mm_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm_add_epi16(a0, a4);
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
+
+void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+
+ _mm_store_si128((__m128i *)coeff, src[0]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[1]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[2]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[3]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[4]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[5]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[6]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[7]);
+}
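hadamard_col8_sse2() applies three add/subtract butterfly stages per 16-bit lane; a scalar sketch of the same 8-point transform, including the permuted output order of its second pass (iter == 1) and, like the SSE2 path, no normalization:

static void hadamard8_1d_sketch(const short *in, short *out) {
  /* stage 1 */
  const short b0 = in[0] + in[1], b1 = in[0] - in[1];
  const short b2 = in[2] + in[3], b3 = in[2] - in[3];
  const short b4 = in[4] + in[5], b5 = in[4] - in[5];
  const short b6 = in[6] + in[7], b7 = in[6] - in[7];
  /* stage 2 */
  const short a0 = b0 + b2, a1 = b1 + b3, a2 = b0 - b2, a3 = b1 - b3;
  const short a4 = b4 + b6, a5 = b5 + b7, a6 = b4 - b6, a7 = b5 - b7;
  /* stage 3, with the output permutation of the iter == 1 branch above */
  out[0] = a0 + a4;  out[7] = a1 + a5;
  out[3] = a2 + a6;  out[4] = a3 + a7;
  out[2] = a0 - a4;  out[6] = a1 - a5;
  out[1] = a2 - a6;  out[5] = a3 - a7;
}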
+
+void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+ + (idx & 0x01) * 8;
+ vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff0 = _mm_srai_epi16(coeff0, 1);
+ coeff1 = _mm_srai_epi16(coeff1, 1);
+ _mm_store_si128((__m128i *)coeff, coeff0);
+ _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+ coeff2 = _mm_srai_epi16(coeff2, 1);
+ coeff3 = _mm_srai_epi16(coeff3, 1);
+ _mm_store_si128((__m128i *)(coeff + 128), coeff2);
+ _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+
+ coeff += 8;
+ }
+}
+
+int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+ int i;
+ __m128i sum = _mm_load_si128((const __m128i *)coeff);
+ __m128i sign = _mm_srai_epi16(sum, 15);
+ __m128i val = _mm_xor_si128(sum, sign);
+ sum = _mm_sub_epi16(val, sign);
+ coeff += 8;
+
+ for (i = 8; i < length; i += 8) {
+ __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+ sign = _mm_srai_epi16(src_line, 15);
+ val = _mm_xor_si128(src_line, sign);
+ val = _mm_sub_epi16(val, sign);
+ sum = _mm_add_epi16(sum, val);
+ coeff += 8;
+ }
+
+ val = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, val);
+ val = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, val);
+ val = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, val);
+
+ return _mm_extract_epi16(sum, 0);
+}
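vp9_satd_sse2() sums absolute coefficient values, building |x| branchlessly as (x ^ (x >> 15)) - (x >> 15) and accumulating with 16-bit adds; a scalar sketch of the same quantity:

#include <stdlib.h>  /* abs() */

/* sum of absolute values over `length` coefficients (length is a
 * multiple of 8 in the SSE2 path above) */
static int satd_c_sketch(const short *coeff, int length) {
  int i, sum = 0;
  for (i = 0; i < length; ++i)
    sum += abs(coeff[i]);
  return sum;
}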
+
+void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
+ const int ref_stride, const int height) {
+ int idx;
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+ __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+ __m128i t0, t1;
+ int height_1 = height - 1;
+ ref += ref_stride;
+
+ for (idx = 1; idx < height_1; idx += 2) {
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+ }
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+
+ if (height == 64) {
+ s0 = _mm_srai_epi16(s0, 5);
+ s1 = _mm_srai_epi16(s1, 5);
+ } else if (height == 32) {
+ s0 = _mm_srai_epi16(s0, 4);
+ s1 = _mm_srai_epi16(s1, 4);
+ } else {
+ s0 = _mm_srai_epi16(s0, 3);
+ s1 = _mm_srai_epi16(s1, 3);
+ }
+
+ _mm_storeu_si128((__m128i *)hbuf, s0);
+ hbuf += 8;
+ _mm_storeu_si128((__m128i *)hbuf, s1);
+}
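vp9_int_pro_row_sse2() produces, for each of 16 columns, the column sum over `height` rows scaled by a height-dependent shift (5 for height 64, 4 for 32, 3 otherwise); a scalar sketch that ignores the saturating accumulation of the SSE2 path:

static void int_pro_row_c_sketch(short *hbuf, const unsigned char *ref,
                                 int ref_stride, int height) {
  const int shift = height == 64 ? 5 : (height == 32 ? 4 : 3);
  int r, c;
  for (c = 0; c < 16; ++c) {  /* the SSE2 version covers 16 columns */
    int sum = 0;
    for (r = 0; r < height; ++r)
      sum += ref[r * ref_stride + c];
    hbuf[c] = (short)(sum >> shift);
  }
}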
+
+int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_load_si128((const __m128i *)ref);
+ __m128i s0 = _mm_sad_epu8(src_line, zero);
+ __m128i s1;
+ int i;
+
+ for (i = 16; i < width; i += 16) {
+ ref += 16;
+ src_line = _mm_load_si128((const __m128i *)ref);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, s1);
+ }
+
+ s1 = _mm_srli_si128(s0, 8);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ return _mm_extract_epi16(s0, 0);
+}
+
+int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
+ const int bwl) {
+ int idx;
+ int width = 4 << bwl;
+ int16_t mean;
+ __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i v1 = _mm_load_si128((const __m128i *)src);
+ __m128i diff = _mm_subs_epi16(v0, v1);
+ __m128i sum = diff;
+ __m128i sse = _mm_madd_epi16(diff, diff);
+
+ ref += 8;
+ src += 8;
+
+ for (idx = 8; idx < width; idx += 8) {
+ v0 = _mm_loadu_si128((const __m128i *)ref);
+ v1 = _mm_load_si128((const __m128i *)src);
+ diff = _mm_subs_epi16(v0, v1);
+
+ sum = _mm_add_epi16(sum, diff);
+ v0 = _mm_madd_epi16(diff, diff);
+ sse = _mm_add_epi32(sse, v0);
+
+ ref += 8;
+ src += 8;
+ }
+
+ v0 = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, v0);
+
+ v1 = _mm_srli_si128(sse, 8);
+ sse = _mm_add_epi32(sse, v1);
+ v1 = _mm_srli_epi64(sse, 32);
+ sse = _mm_add_epi32(sse, v1);
+
+ mean = _mm_extract_epi16(sum, 0);
+
+ return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
+}
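vp9_vector_var_sse2() accumulates the ref/src difference and its square over width = 4 << bwl elements, then returns sse minus the squared mean, with the divide by width expressed as >> (bwl + 2); a scalar sketch:

static int vector_var_c_sketch(const short *ref, const short *src, int bwl) {
  const int width = 4 << bwl;
  int i, mean = 0, sse = 0;
  for (i = 0; i < width; ++i) {
    const int diff = ref[i] - src[i];
    mean += diff;
    sse += diff * diff;
  }
  /* mean * mean / width, since width == 1 << (bwl + 2) */
  return sse - ((mean * mean) >> (bwl + 2));
}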
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
index 9ea22fed2b7..66827ad8037 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
@@ -13,13 +13,14 @@
#include "vpx_ports/mem.h"
#define pair256_set_epi16(a, b) \
- _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a)
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
#define pair256_set_epi32(a, b) \
- _mm256_set_epi32(b, a, b, a, b, a, b, a)
-
-
-
+ _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), \
+ (int)(b), (int)(a), (int)(b), (int)(a))
#if FDCT32x32_HIGH_PRECISION
static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
@@ -50,7 +51,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64);
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 42fdbbdc5ce..099993aa6a0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -10,30 +10,50 @@
#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vp9/encoder/x86/vp9_dct_sse2.h"
+#include "vp9/encoder/vp9_dct.h"
#include "vpx_ports/mem.h"
-#define pair_set_epi32(a, b) \
- _mm_set_epi32(b, a, b, a)
-
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
#if FDCT32x32_HIGH_PRECISION
-static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
- __m128i buf0, buf1;
- buf0 = _mm_mul_epu32(a, b);
- a = _mm_srli_epi64(a, 32);
- b = _mm_srli_epi64(b, 32);
- buf1 = _mm_mul_epu32(a, b);
- return _mm_add_epi64(buf0, buf1);
+void vp9_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = intermediate[j * 32 + i];
+ vp9_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
}
-
-static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
- __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
- __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
- return _mm_unpacklo_epi64(buf0, buf1);
+ #define HIGH_FDCT32x32_2D_C vp9_highbd_fdct32x32_c
+ #define HIGH_FDCT32x32_2D_ROWS_C vp9_fdct32x32_rows_c
+#else
+void vp9_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = intermediate[j * 32 + i];
+ vp9_fdct32(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] = temp_out[j];
+ }
}
-#endif
+ #define HIGH_FDCT32x32_2D_C vp9_highbd_fdct32x32_rd_c
+ #define HIGH_FDCT32x32_2D_ROWS_C vp9_fdct32x32_rd_rows_c
+#endif // FDCT32x32_HIGH_PRECISION
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif // DCT_HIGH_BIT_DEPTH
+
void FDCT32x32_2D(const int16_t *input,
- int16_t *output_org, int stride) {
+ tran_low_t *output_org, int stride) {
// Calculate pre-multiplied strides
const int str1 = stride;
const int str2 = 2 * stride;
@@ -44,7 +64,7 @@ void FDCT32x32_2D(const int16_t *input,
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
@@ -84,6 +104,9 @@ void FDCT32x32_2D(const int16_t *input,
const __m128i kOne = _mm_set1_epi16(1);
// Do the two transform/transpose passes
int pass;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
for (pass = 0; pass < 2; ++pass) {
// We process eight columns (transposed rows in second pass) at a time.
int column_start;
@@ -237,14 +260,23 @@ void FDCT32x32_2D(const int16_t *input,
__m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
__m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
__m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
- step1[ 0] = _mm_add_epi16(in00, in31);
- step1[ 1] = _mm_add_epi16(in01, in30);
- step1[ 2] = _mm_add_epi16(in02, in29);
- step1[ 3] = _mm_add_epi16(in03, in28);
- step1[28] = _mm_sub_epi16(in03, in28);
- step1[29] = _mm_sub_epi16(in02, in29);
- step1[30] = _mm_sub_epi16(in01, in30);
- step1[31] = _mm_sub_epi16(in00, in31);
+ step1[0] = ADD_EPI16(in00, in31);
+ step1[1] = ADD_EPI16(in01, in30);
+ step1[2] = ADD_EPI16(in02, in29);
+ step1[3] = ADD_EPI16(in03, in28);
+ step1[28] = SUB_EPI16(in03, in28);
+ step1[29] = SUB_EPI16(in02, in29);
+ step1[30] = SUB_EPI16(in01, in30);
+ step1[31] = SUB_EPI16(in00, in31);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+ &step1[3], &step1[28], &step1[29],
+ &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
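+        // These stage-1 loads use a fixed stride of 32, i.e. they read the
+        // transposed intermediate of the second pass (the first pass has
+        // its own load/shift path), so on overflow it suffices to redo just
+        // the row pass in C; stages shared by both passes select the C
+        // fallback based on `pass` instead.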
}
{
__m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
@@ -255,14 +287,23 @@ void FDCT32x32_2D(const int16_t *input,
__m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
__m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
__m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
- step1[ 4] = _mm_add_epi16(in04, in27);
- step1[ 5] = _mm_add_epi16(in05, in26);
- step1[ 6] = _mm_add_epi16(in06, in25);
- step1[ 7] = _mm_add_epi16(in07, in24);
- step1[24] = _mm_sub_epi16(in07, in24);
- step1[25] = _mm_sub_epi16(in06, in25);
- step1[26] = _mm_sub_epi16(in05, in26);
- step1[27] = _mm_sub_epi16(in04, in27);
+ step1[4] = ADD_EPI16(in04, in27);
+ step1[5] = ADD_EPI16(in05, in26);
+ step1[6] = ADD_EPI16(in06, in25);
+ step1[7] = ADD_EPI16(in07, in24);
+ step1[24] = SUB_EPI16(in07, in24);
+ step1[25] = SUB_EPI16(in06, in25);
+ step1[26] = SUB_EPI16(in05, in26);
+ step1[27] = SUB_EPI16(in04, in27);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+ &step1[7], &step1[24], &step1[25],
+ &step1[26], &step1[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
__m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
@@ -273,14 +314,23 @@ void FDCT32x32_2D(const int16_t *input,
__m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
__m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
__m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
- step1[ 8] = _mm_add_epi16(in08, in23);
- step1[ 9] = _mm_add_epi16(in09, in22);
- step1[10] = _mm_add_epi16(in10, in21);
- step1[11] = _mm_add_epi16(in11, in20);
- step1[20] = _mm_sub_epi16(in11, in20);
- step1[21] = _mm_sub_epi16(in10, in21);
- step1[22] = _mm_sub_epi16(in09, in22);
- step1[23] = _mm_sub_epi16(in08, in23);
+ step1[8] = ADD_EPI16(in08, in23);
+ step1[9] = ADD_EPI16(in09, in22);
+ step1[10] = ADD_EPI16(in10, in21);
+ step1[11] = ADD_EPI16(in11, in20);
+ step1[20] = SUB_EPI16(in11, in20);
+ step1[21] = SUB_EPI16(in10, in21);
+ step1[22] = SUB_EPI16(in09, in22);
+ step1[23] = SUB_EPI16(in08, in23);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+ &step1[11], &step1[20], &step1[21],
+ &step1[22], &step1[23]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
__m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
@@ -291,34 +341,57 @@ void FDCT32x32_2D(const int16_t *input,
__m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
__m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
__m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
- step1[12] = _mm_add_epi16(in12, in19);
- step1[13] = _mm_add_epi16(in13, in18);
- step1[14] = _mm_add_epi16(in14, in17);
- step1[15] = _mm_add_epi16(in15, in16);
- step1[16] = _mm_sub_epi16(in15, in16);
- step1[17] = _mm_sub_epi16(in14, in17);
- step1[18] = _mm_sub_epi16(in13, in18);
- step1[19] = _mm_sub_epi16(in12, in19);
+ step1[12] = ADD_EPI16(in12, in19);
+ step1[13] = ADD_EPI16(in13, in18);
+ step1[14] = ADD_EPI16(in14, in17);
+ step1[15] = ADD_EPI16(in15, in16);
+ step1[16] = SUB_EPI16(in15, in16);
+ step1[17] = SUB_EPI16(in14, in17);
+ step1[18] = SUB_EPI16(in13, in18);
+ step1[19] = SUB_EPI16(in12, in19);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+ &step1[15], &step1[16], &step1[17],
+ &step1[18], &step1[19]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
}
// Stage 2
{
- step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
- step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
- step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
- step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
- step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
- step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
- step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
- step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
- step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
- step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
- step2[10] = _mm_sub_epi16(step1[5], step1[10]);
- step2[11] = _mm_sub_epi16(step1[4], step1[11]);
- step2[12] = _mm_sub_epi16(step1[3], step1[12]);
- step2[13] = _mm_sub_epi16(step1[2], step1[13]);
- step2[14] = _mm_sub_epi16(step1[1], step1[14]);
- step2[15] = _mm_sub_epi16(step1[0], step1[15]);
+ step2[0] = ADD_EPI16(step1[0], step1[15]);
+ step2[1] = ADD_EPI16(step1[1], step1[14]);
+ step2[2] = ADD_EPI16(step1[2], step1[13]);
+ step2[3] = ADD_EPI16(step1[3], step1[12]);
+ step2[4] = ADD_EPI16(step1[4], step1[11]);
+ step2[5] = ADD_EPI16(step1[5], step1[10]);
+ step2[6] = ADD_EPI16(step1[6], step1[ 9]);
+ step2[7] = ADD_EPI16(step1[7], step1[ 8]);
+ step2[8] = SUB_EPI16(step1[7], step1[ 8]);
+ step2[9] = SUB_EPI16(step1[6], step1[ 9]);
+ step2[10] = SUB_EPI16(step1[5], step1[10]);
+ step2[11] = SUB_EPI16(step1[4], step1[11]);
+ step2[12] = SUB_EPI16(step1[3], step1[12]);
+ step2[13] = SUB_EPI16(step1[2], step1[13]);
+ step2[14] = SUB_EPI16(step1[1], step1[14]);
+ step2[15] = SUB_EPI16(step1[0], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[0], &step2[1], &step2[2], &step2[3],
+ &step2[4], &step2[5], &step2[6], &step2[7],
+ &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
@@ -387,6 +460,18 @@ void FDCT32x32_2D(const int16_t *input,
step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+ &step2[23], &step2[24], &step2[25],
+ &step2[26], &step2[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
#if !FDCT32x32_HIGH_PRECISION
@@ -426,49 +511,63 @@ void FDCT32x32_2D(const int16_t *input,
__m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
__m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
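        // Each compare mask is 0 or -1 per lane; subtracting a mask adds 1
        // to the negative lanes, and combined with the +kOne and >> 2 below
        // this is the vector form of the scalar rounding
        // (x + 1 + (x < 0)) >> 2.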
- step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0);
- step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0);
- step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0);
- step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0);
- step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0);
- step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0);
- step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0);
- step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0);
- step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
- step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
- step2[10] = _mm_sub_epi16(step2[10], s3_10_0);
- step2[11] = _mm_sub_epi16(step2[11], s3_11_0);
- step2[12] = _mm_sub_epi16(step2[12], s3_12_0);
- step2[13] = _mm_sub_epi16(step2[13], s3_13_0);
- step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
- step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
- step1[16] = _mm_sub_epi16(step1[16], s3_16_0);
- step1[17] = _mm_sub_epi16(step1[17], s3_17_0);
- step1[18] = _mm_sub_epi16(step1[18], s3_18_0);
- step1[19] = _mm_sub_epi16(step1[19], s3_19_0);
- step2[20] = _mm_sub_epi16(step2[20], s3_20_0);
- step2[21] = _mm_sub_epi16(step2[21], s3_21_0);
- step2[22] = _mm_sub_epi16(step2[22], s3_22_0);
- step2[23] = _mm_sub_epi16(step2[23], s3_23_0);
- step2[24] = _mm_sub_epi16(step2[24], s3_24_0);
- step2[25] = _mm_sub_epi16(step2[25], s3_25_0);
- step2[26] = _mm_sub_epi16(step2[26], s3_26_0);
- step2[27] = _mm_sub_epi16(step2[27], s3_27_0);
- step1[28] = _mm_sub_epi16(step1[28], s3_28_0);
- step1[29] = _mm_sub_epi16(step1[29], s3_29_0);
- step1[30] = _mm_sub_epi16(step1[30], s3_30_0);
- step1[31] = _mm_sub_epi16(step1[31], s3_31_0);
-
- step2[ 0] = _mm_add_epi16(step2[ 0], kOne);
- step2[ 1] = _mm_add_epi16(step2[ 1], kOne);
- step2[ 2] = _mm_add_epi16(step2[ 2], kOne);
- step2[ 3] = _mm_add_epi16(step2[ 3], kOne);
- step2[ 4] = _mm_add_epi16(step2[ 4], kOne);
- step2[ 5] = _mm_add_epi16(step2[ 5], kOne);
- step2[ 6] = _mm_add_epi16(step2[ 6], kOne);
- step2[ 7] = _mm_add_epi16(step2[ 7], kOne);
- step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
- step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
+ step2[0] = SUB_EPI16(step2[ 0], s3_00_0);
+ step2[1] = SUB_EPI16(step2[ 1], s3_01_0);
+ step2[2] = SUB_EPI16(step2[ 2], s3_02_0);
+ step2[3] = SUB_EPI16(step2[ 3], s3_03_0);
+ step2[4] = SUB_EPI16(step2[ 4], s3_04_0);
+ step2[5] = SUB_EPI16(step2[ 5], s3_05_0);
+ step2[6] = SUB_EPI16(step2[ 6], s3_06_0);
+ step2[7] = SUB_EPI16(step2[ 7], s3_07_0);
+ step2[8] = SUB_EPI16(step2[ 8], s2_08_0);
+ step2[9] = SUB_EPI16(step2[ 9], s2_09_0);
+ step2[10] = SUB_EPI16(step2[10], s3_10_0);
+ step2[11] = SUB_EPI16(step2[11], s3_11_0);
+ step2[12] = SUB_EPI16(step2[12], s3_12_0);
+ step2[13] = SUB_EPI16(step2[13], s3_13_0);
+ step2[14] = SUB_EPI16(step2[14], s2_14_0);
+ step2[15] = SUB_EPI16(step2[15], s2_15_0);
+ step1[16] = SUB_EPI16(step1[16], s3_16_0);
+ step1[17] = SUB_EPI16(step1[17], s3_17_0);
+ step1[18] = SUB_EPI16(step1[18], s3_18_0);
+ step1[19] = SUB_EPI16(step1[19], s3_19_0);
+ step2[20] = SUB_EPI16(step2[20], s3_20_0);
+ step2[21] = SUB_EPI16(step2[21], s3_21_0);
+ step2[22] = SUB_EPI16(step2[22], s3_22_0);
+ step2[23] = SUB_EPI16(step2[23], s3_23_0);
+ step2[24] = SUB_EPI16(step2[24], s3_24_0);
+ step2[25] = SUB_EPI16(step2[25], s3_25_0);
+ step2[26] = SUB_EPI16(step2[26], s3_26_0);
+ step2[27] = SUB_EPI16(step2[27], s3_27_0);
+ step1[28] = SUB_EPI16(step1[28], s3_28_0);
+ step1[29] = SUB_EPI16(step1[29], s3_29_0);
+ step1[30] = SUB_EPI16(step1[30], s3_30_0);
+ step1[31] = SUB_EPI16(step1[31], s3_31_0);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x32(
+ &step2[0], &step2[1], &step2[2], &step2[3],
+ &step2[4], &step2[5], &step2[6], &step2[7],
+ &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15],
+ &step1[16], &step1[17], &step1[18], &step1[19],
+ &step2[20], &step2[21], &step2[22], &step2[23],
+ &step2[24], &step2[25], &step2[26], &step2[27],
+ &step1[28], &step1[29], &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ step2[0] = _mm_add_epi16(step2[ 0], kOne);
+ step2[1] = _mm_add_epi16(step2[ 1], kOne);
+ step2[2] = _mm_add_epi16(step2[ 2], kOne);
+ step2[3] = _mm_add_epi16(step2[ 3], kOne);
+ step2[4] = _mm_add_epi16(step2[ 4], kOne);
+ step2[5] = _mm_add_epi16(step2[ 5], kOne);
+ step2[6] = _mm_add_epi16(step2[ 6], kOne);
+ step2[7] = _mm_add_epi16(step2[ 7], kOne);
+ step2[8] = _mm_add_epi16(step2[ 8], kOne);
+ step2[9] = _mm_add_epi16(step2[ 9], kOne);
step2[10] = _mm_add_epi16(step2[10], kOne);
step2[11] = _mm_add_epi16(step2[11], kOne);
step2[12] = _mm_add_epi16(step2[12], kOne);
@@ -492,16 +591,16 @@ void FDCT32x32_2D(const int16_t *input,
step1[30] = _mm_add_epi16(step1[30], kOne);
step1[31] = _mm_add_epi16(step1[31], kOne);
- step2[ 0] = _mm_srai_epi16(step2[ 0], 2);
- step2[ 1] = _mm_srai_epi16(step2[ 1], 2);
- step2[ 2] = _mm_srai_epi16(step2[ 2], 2);
- step2[ 3] = _mm_srai_epi16(step2[ 3], 2);
- step2[ 4] = _mm_srai_epi16(step2[ 4], 2);
- step2[ 5] = _mm_srai_epi16(step2[ 5], 2);
- step2[ 6] = _mm_srai_epi16(step2[ 6], 2);
- step2[ 7] = _mm_srai_epi16(step2[ 7], 2);
- step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
- step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
+ step2[0] = _mm_srai_epi16(step2[ 0], 2);
+ step2[1] = _mm_srai_epi16(step2[ 1], 2);
+ step2[2] = _mm_srai_epi16(step2[ 2], 2);
+ step2[3] = _mm_srai_epi16(step2[ 3], 2);
+ step2[4] = _mm_srai_epi16(step2[ 4], 2);
+ step2[5] = _mm_srai_epi16(step2[ 5], 2);
+ step2[6] = _mm_srai_epi16(step2[ 6], 2);
+ step2[7] = _mm_srai_epi16(step2[ 7], 2);
+ step2[8] = _mm_srai_epi16(step2[ 8], 2);
+ step2[9] = _mm_srai_epi16(step2[ 9], 2);
step2[10] = _mm_srai_epi16(step2[10], 2);
step2[11] = _mm_srai_epi16(step2[11], 2);
step2[12] = _mm_srai_epi16(step2[12], 2);
@@ -525,21 +624,33 @@ void FDCT32x32_2D(const int16_t *input,
step1[30] = _mm_srai_epi16(step1[30], 2);
step1[31] = _mm_srai_epi16(step1[31], 2);
}
-#endif
+#endif // !FDCT32x32_HIGH_PRECISION
#if FDCT32x32_HIGH_PRECISION
if (pass == 0) {
#endif
// Stage 3
{
- step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
- step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
- step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
- step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
- step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
- step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
- step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
- step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
+ step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
+ step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
+ step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
+ step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
+ step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
+ step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
+ step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
+ step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+ &step3[3], &step3[4], &step3[5],
+ &step3[6], &step3[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
@@ -576,40 +687,79 @@ void FDCT32x32_2D(const int16_t *input,
step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step3[10], &step3[11],
+ &step3[12], &step3[13]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
- step3[16] = _mm_add_epi16(step2[23], step1[16]);
- step3[17] = _mm_add_epi16(step2[22], step1[17]);
- step3[18] = _mm_add_epi16(step2[21], step1[18]);
- step3[19] = _mm_add_epi16(step2[20], step1[19]);
- step3[20] = _mm_sub_epi16(step1[19], step2[20]);
- step3[21] = _mm_sub_epi16(step1[18], step2[21]);
- step3[22] = _mm_sub_epi16(step1[17], step2[22]);
- step3[23] = _mm_sub_epi16(step1[16], step2[23]);
- step3[24] = _mm_sub_epi16(step1[31], step2[24]);
- step3[25] = _mm_sub_epi16(step1[30], step2[25]);
- step3[26] = _mm_sub_epi16(step1[29], step2[26]);
- step3[27] = _mm_sub_epi16(step1[28], step2[27]);
- step3[28] = _mm_add_epi16(step2[27], step1[28]);
- step3[29] = _mm_add_epi16(step2[26], step1[29]);
- step3[30] = _mm_add_epi16(step2[25], step1[30]);
- step3[31] = _mm_add_epi16(step2[24], step1[31]);
+ step3[16] = ADD_EPI16(step2[23], step1[16]);
+ step3[17] = ADD_EPI16(step2[22], step1[17]);
+ step3[18] = ADD_EPI16(step2[21], step1[18]);
+ step3[19] = ADD_EPI16(step2[20], step1[19]);
+ step3[20] = SUB_EPI16(step1[19], step2[20]);
+ step3[21] = SUB_EPI16(step1[18], step2[21]);
+ step3[22] = SUB_EPI16(step1[17], step2[22]);
+ step3[23] = SUB_EPI16(step1[16], step2[23]);
+ step3[24] = SUB_EPI16(step1[31], step2[24]);
+ step3[25] = SUB_EPI16(step1[30], step2[25]);
+ step3[26] = SUB_EPI16(step1[29], step2[26]);
+ step3[27] = SUB_EPI16(step1[28], step2[27]);
+ step3[28] = ADD_EPI16(step2[27], step1[28]);
+ step3[29] = ADD_EPI16(step2[26], step1[29]);
+ step3[30] = ADD_EPI16(step2[25], step1[30]);
+ step3[31] = ADD_EPI16(step2[24], step1[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step3[16], &step3[17], &step3[18], &step3[19],
+ &step3[20], &step3[21], &step3[22], &step3[23],
+ &step3[24], &step3[25], &step3[26], &step3[27],
+ &step3[28], &step3[29], &step3[30], &step3[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
// Stage 4
{
- step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
- step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]);
- step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]);
- step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]);
- step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]);
- step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]);
- step1[10] = _mm_sub_epi16(step2[ 9], step3[10]);
- step1[11] = _mm_sub_epi16(step2[ 8], step3[11]);
- step1[12] = _mm_sub_epi16(step2[15], step3[12]);
- step1[13] = _mm_sub_epi16(step2[14], step3[13]);
- step1[14] = _mm_add_epi16(step3[13], step2[14]);
- step1[15] = _mm_add_epi16(step3[12], step2[15]);
+ step1[0] = ADD_EPI16(step3[ 3], step3[ 0]);
+ step1[1] = ADD_EPI16(step3[ 2], step3[ 1]);
+ step1[2] = SUB_EPI16(step3[ 1], step3[ 2]);
+ step1[3] = SUB_EPI16(step3[ 0], step3[ 3]);
+ step1[8] = ADD_EPI16(step3[11], step2[ 8]);
+ step1[9] = ADD_EPI16(step3[10], step2[ 9]);
+ step1[10] = SUB_EPI16(step2[ 9], step3[10]);
+ step1[11] = SUB_EPI16(step2[ 8], step3[11]);
+ step1[12] = SUB_EPI16(step2[15], step3[12]);
+ step1[13] = SUB_EPI16(step2[14], step3[13]);
+ step1[14] = ADD_EPI16(step3[13], step2[14]);
+ step1[15] = ADD_EPI16(step3[12], step2[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[0], &step1[1], &step1[2], &step1[3],
+ &step1[4], &step1[5], &step1[6], &step1[7],
+ &step1[8], &step1[9], &step1[10], &step1[11],
+ &step1[12], &step1[13], &step1[14], &step1[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
@@ -630,6 +780,16 @@ void FDCT32x32_2D(const int16_t *input,
// Combine
step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
@@ -698,13 +858,36 @@ void FDCT32x32_2D(const int16_t *input,
step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[26], &step1[27],
+ &step1[28], &step1[29]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
// Stage 5
{
- step2[4] = _mm_add_epi16(step1[5], step3[4]);
- step2[5] = _mm_sub_epi16(step3[4], step1[5]);
- step2[6] = _mm_sub_epi16(step3[7], step1[6]);
- step2[7] = _mm_add_epi16(step1[6], step3[7]);
+ step2[4] = ADD_EPI16(step1[5], step3[4]);
+ step2[5] = SUB_EPI16(step3[4], step1[5]);
+ step2[6] = SUB_EPI16(step3[7], step1[6]);
+ step2[7] = ADD_EPI16(step1[6], step3[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[4], &step2[5],
+ &step2[6], &step2[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
@@ -741,6 +924,17 @@ void FDCT32x32_2D(const int16_t *input,
out[16] = _mm_packs_epi32(out_16_6, out_16_7);
out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[0], &out[16],
+ &out[8], &out[24]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
@@ -777,24 +971,49 @@ void FDCT32x32_2D(const int16_t *input,
step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[9], &step2[10],
+ &step2[13], &step2[14]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
- step2[16] = _mm_add_epi16(step1[19], step3[16]);
- step2[17] = _mm_add_epi16(step1[18], step3[17]);
- step2[18] = _mm_sub_epi16(step3[17], step1[18]);
- step2[19] = _mm_sub_epi16(step3[16], step1[19]);
- step2[20] = _mm_sub_epi16(step3[23], step1[20]);
- step2[21] = _mm_sub_epi16(step3[22], step1[21]);
- step2[22] = _mm_add_epi16(step1[21], step3[22]);
- step2[23] = _mm_add_epi16(step1[20], step3[23]);
- step2[24] = _mm_add_epi16(step1[27], step3[24]);
- step2[25] = _mm_add_epi16(step1[26], step3[25]);
- step2[26] = _mm_sub_epi16(step3[25], step1[26]);
- step2[27] = _mm_sub_epi16(step3[24], step1[27]);
- step2[28] = _mm_sub_epi16(step3[31], step1[28]);
- step2[29] = _mm_sub_epi16(step3[30], step1[29]);
- step2[30] = _mm_add_epi16(step1[29], step3[30]);
- step2[31] = _mm_add_epi16(step1[28], step3[31]);
+ step2[16] = ADD_EPI16(step1[19], step3[16]);
+ step2[17] = ADD_EPI16(step1[18], step3[17]);
+ step2[18] = SUB_EPI16(step3[17], step1[18]);
+ step2[19] = SUB_EPI16(step3[16], step1[19]);
+ step2[20] = SUB_EPI16(step3[23], step1[20]);
+ step2[21] = SUB_EPI16(step3[22], step1[21]);
+ step2[22] = ADD_EPI16(step1[21], step3[22]);
+ step2[23] = ADD_EPI16(step1[20], step3[23]);
+ step2[24] = ADD_EPI16(step1[27], step3[24]);
+ step2[25] = ADD_EPI16(step1[26], step3[25]);
+ step2[26] = SUB_EPI16(step3[25], step1[26]);
+ step2[27] = SUB_EPI16(step3[24], step1[27]);
+ step2[28] = SUB_EPI16(step3[31], step1[28]);
+ step2[29] = SUB_EPI16(step3[30], step1[29]);
+ step2[30] = ADD_EPI16(step1[29], step3[30]);
+ step2[31] = ADD_EPI16(step1[28], step3[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[16], &step2[17], &step2[18], &step2[19],
+ &step2[20], &step2[21], &step2[22], &step2[23],
+ &step2[24], &step2[25], &step2[26], &step2[27],
+ &step2[28], &step2[29], &step2[30], &step2[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
// Stage 6
{
@@ -832,20 +1051,43 @@ void FDCT32x32_2D(const int16_t *input,
const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
// Combine
- out[ 4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[4] = _mm_packs_epi32(out_04_6, out_04_7);
out[20] = _mm_packs_epi32(out_20_6, out_20_7);
out[12] = _mm_packs_epi32(out_12_6, out_12_7);
out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[4], &out[20],
+ &out[12], &out[28]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
- step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]);
- step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]);
- step3[10] = _mm_sub_epi16(step1[11], step2[10]);
- step3[11] = _mm_add_epi16(step2[10], step1[11]);
- step3[12] = _mm_add_epi16(step2[13], step1[12]);
- step3[13] = _mm_sub_epi16(step1[12], step2[13]);
- step3[14] = _mm_sub_epi16(step1[15], step2[14]);
- step3[15] = _mm_add_epi16(step2[14], step1[15]);
+ step3[8] = ADD_EPI16(step2[ 9], step1[ 8]);
+ step3[9] = SUB_EPI16(step1[ 8], step2[ 9]);
+ step3[10] = SUB_EPI16(step1[11], step2[10]);
+ step3[11] = ADD_EPI16(step2[10], step1[11]);
+ step3[12] = ADD_EPI16(step2[13], step1[12]);
+ step3[13] = SUB_EPI16(step1[12], step2[13]);
+ step3[14] = SUB_EPI16(step1[15], step2[14]);
+ step3[15] = ADD_EPI16(step2[14], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+ &step3[11], &step3[12], &step3[13],
+ &step3[14], &step3[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
@@ -915,6 +1157,18 @@ void FDCT32x32_2D(const int16_t *input,
step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+ &step3[22], &step3[25], &step3[26],
+ &step3[29], &step3[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
// Stage 7
{
@@ -984,24 +1238,50 @@ void FDCT32x32_2D(const int16_t *input,
out[22] = _mm_packs_epi32(out_22_6, out_22_7);
out[14] = _mm_packs_epi32(out_14_6, out_14_7);
out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+ &out[26], &out[6], &out[22],
+ &out[14], &out[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
- step1[16] = _mm_add_epi16(step3[17], step2[16]);
- step1[17] = _mm_sub_epi16(step2[16], step3[17]);
- step1[18] = _mm_sub_epi16(step2[19], step3[18]);
- step1[19] = _mm_add_epi16(step3[18], step2[19]);
- step1[20] = _mm_add_epi16(step3[21], step2[20]);
- step1[21] = _mm_sub_epi16(step2[20], step3[21]);
- step1[22] = _mm_sub_epi16(step2[23], step3[22]);
- step1[23] = _mm_add_epi16(step3[22], step2[23]);
- step1[24] = _mm_add_epi16(step3[25], step2[24]);
- step1[25] = _mm_sub_epi16(step2[24], step3[25]);
- step1[26] = _mm_sub_epi16(step2[27], step3[26]);
- step1[27] = _mm_add_epi16(step3[26], step2[27]);
- step1[28] = _mm_add_epi16(step3[29], step2[28]);
- step1[29] = _mm_sub_epi16(step2[28], step3[29]);
- step1[30] = _mm_sub_epi16(step2[31], step3[30]);
- step1[31] = _mm_add_epi16(step3[30], step2[31]);
+ step1[16] = ADD_EPI16(step3[17], step2[16]);
+ step1[17] = SUB_EPI16(step2[16], step3[17]);
+ step1[18] = SUB_EPI16(step2[19], step3[18]);
+ step1[19] = ADD_EPI16(step3[18], step2[19]);
+ step1[20] = ADD_EPI16(step3[21], step2[20]);
+ step1[21] = SUB_EPI16(step2[20], step3[21]);
+ step1[22] = SUB_EPI16(step2[23], step3[22]);
+ step1[23] = ADD_EPI16(step3[22], step2[23]);
+ step1[24] = ADD_EPI16(step3[25], step2[24]);
+ step1[25] = SUB_EPI16(step2[24], step3[25]);
+ step1[26] = SUB_EPI16(step2[27], step3[26]);
+ step1[27] = ADD_EPI16(step3[26], step2[27]);
+ step1[28] = ADD_EPI16(step3[29], step2[28]);
+ step1[29] = SUB_EPI16(step2[28], step3[29]);
+ step1[30] = SUB_EPI16(step2[31], step3[30]);
+ step1[31] = ADD_EPI16(step3[30], step2[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[16], &step1[17], &step1[18], &step1[19],
+ &step1[20], &step1[21], &step1[22], &step1[23],
+ &step1[24], &step1[25], &step1[26], &step1[27],
+ &step1[28], &step1[29], &step1[30], &step1[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
// Final stage --- outputs indices are bit-reversed.
{
@@ -1071,6 +1351,18 @@ void FDCT32x32_2D(const int16_t *input,
out[23] = _mm_packs_epi32(out_23_6, out_23_7);
out[15] = _mm_packs_epi32(out_15_6, out_15_7);
out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+ &out[25], &out[7], &out[23],
+ &out[15], &out[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
@@ -1139,6 +1431,18 @@ void FDCT32x32_2D(const int16_t *input,
out[19] = _mm_packs_epi32(out_19_6, out_19_7);
out[11] = _mm_packs_epi32(out_11_6, out_11_7);
out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+ &out[29], &out[3], &out[19],
+ &out[11], &out[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
#if FDCT32x32_HIGH_PRECISION
} else {
@@ -1390,15 +1694,22 @@ void FDCT32x32_2D(const int16_t *input,
// TODO(jingning): manually inline k_madd_epi32_ to further hide
// instruction latency.
- v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
- v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
- v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
- v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
- v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
- v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
- v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
- v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
-
+ v[0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32(u[3], k32_p16_p16);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3],
+ &v[4], &v[5], &v[6], &v[7], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[0] = k_packs_epi64(v[0], v[1]);
u[1] = k_packs_epi64(v[2], v[3]);
u[2] = k_packs_epi64(v[4], v[5]);
@@ -1469,6 +1780,18 @@ void FDCT32x32_2D(const int16_t *input,
v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -1565,6 +1888,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = k_madd_epi32(u[6], k32_m08_p24);
v[15] = k_madd_epi32(u[7], k32_m08_p24);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[0] = k_packs_epi64(v[0], v[1]);
u[1] = k_packs_epi64(v[2], v[3]);
u[2] = k_packs_epi64(v[4], v[5]);
@@ -1633,6 +1966,14 @@ void FDCT32x32_2D(const int16_t *input,
out[16] = _mm_packs_epi32(u[2], u[3]);
out[ 8] = _mm_packs_epi32(u[4], u[5]);
out[24] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[0], &out[16],
+ &out[8], &out[24]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
@@ -1665,6 +2006,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = k_madd_epi32(u[2], k32_p24_p08);
v[15] = k_madd_epi32(u[3], k32_p24_p08);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[0] = k_packs_epi64(v[0], v[1]);
u[1] = k_packs_epi64(v[2], v[3]);
u[2] = k_packs_epi64(v[4], v[5]);
@@ -1767,6 +2118,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = k_madd_epi32(u[14], k32_m04_p28);
v[15] = k_madd_epi32(u[15], k32_m04_p28);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[0] = k_packs_epi64(v[0], v[1]);
u[1] = k_packs_epi64(v[2], v[3]);
u[2] = k_packs_epi64(v[4], v[5]);
@@ -1834,6 +2195,14 @@ void FDCT32x32_2D(const int16_t *input,
out[20] = _mm_packs_epi32(u[2], u[3]);
out[12] = _mm_packs_epi32(u[4], u[5]);
out[28] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[4], &out[20],
+ &out[12], &out[28]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
@@ -1912,6 +2281,18 @@ void FDCT32x32_2D(const int16_t *input,
v[30] = k_madd_epi32(u[ 2], k32_p28_p04);
v[31] = k_madd_epi32(u[ 3], k32_p28_p04);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2024,6 +2405,18 @@ void FDCT32x32_2D(const int16_t *input,
v[30] = k_madd_epi32(u[ 2], k32_m02_p30);
v[31] = k_madd_epi32(u[ 3], k32_m02_p30);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2151,6 +2544,15 @@ void FDCT32x32_2D(const int16_t *input,
out[22] = _mm_packs_epi32(u[10], u[11]);
out[14] = _mm_packs_epi32(u[12], u[13]);
out[30] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+ &out[26], &out[6], &out[22],
+ &out[14], &out[30]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
@@ -2247,6 +2649,18 @@ void FDCT32x32_2D(const int16_t *input,
v[30] = k_madd_epi32(u[ 2], k32_m01_p31);
v[31] = k_madd_epi32(u[ 3], k32_m01_p31);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2374,6 +2788,15 @@ void FDCT32x32_2D(const int16_t *input,
out[23] = _mm_packs_epi32(u[10], u[11]);
out[15] = _mm_packs_epi32(u[12], u[13]);
out[31] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+ &out[25], &out[7], &out[23],
+ &out[15], &out[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
@@ -2435,6 +2858,18 @@ void FDCT32x32_2D(const int16_t *input,
v[30] = k_madd_epi32(u[ 2], k32_m05_p27);
v[31] = k_madd_epi32(u[ 3], k32_m05_p27);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2562,18 +2997,23 @@ void FDCT32x32_2D(const int16_t *input,
out[19] = _mm_packs_epi32(u[10], u[11]);
out[11] = _mm_packs_epi32(u[12], u[13]);
out[27] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+ &out[29], &out[3], &out[19],
+ &out[11], &out[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
}
-#endif
+#endif // FDCT32x32_HIGH_PRECISION
// Transpose the results, do it as four 8x8 transposes.
{
int transpose_block;
- int16_t *output;
- if (0 == pass) {
- output = &intermediate[column_start * 32];
- } else {
- output = &output_org[column_start * 32];
- }
+ int16_t *output0 = &intermediate[column_start * 32];
+ tran_low_t *output1 = &output_org[column_start * 32];
for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
__m128i *this_out = &out[8 * transpose_block];
// 00 01 02 03 04 05 06 07
@@ -2674,18 +3114,36 @@ void FDCT32x32_2D(const int16_t *input,
}
      // Note: even though all these stores are aligned, using the aligned
      // intrinsic makes the code slightly slower.
- _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0);
- _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1);
- _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2);
- _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3);
- _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4);
- _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5);
- _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6);
- _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7);
- // Process next 8x8
- output += 8;
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
+ _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
+ _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
+ _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
+ _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
+ _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
+ _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
+ _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
+ // Process next 8x8
+ output0 += 8;
+ } else {
+ storeu_output(&tr2_0, (output1 + 0 * 32));
+ storeu_output(&tr2_1, (output1 + 1 * 32));
+ storeu_output(&tr2_2, (output1 + 2 * 32));
+ storeu_output(&tr2_3, (output1 + 3 * 32));
+ storeu_output(&tr2_4, (output1 + 4 * 32));
+ storeu_output(&tr2_5, (output1 + 5 * 32));
+ storeu_output(&tr2_6, (output1 + 6 * 32));
+ storeu_output(&tr2_7, (output1 + 7 * 32));
+ // Process next 8x8
+ output1 += 8;
+ }
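+        // (storeu_output, per vp9_dct_sse2.h, is assumed to widen the eight
+        // int16 lanes to 32 bits when CONFIG_VP9_HIGHBITDEPTH makes
+        // tran_low_t 32-bit, and to fall back to a plain _mm_storeu_si128
+        // otherwise.)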
}
}
}
}
} // NOLINT
+
+#undef ADD_EPI16
+#undef SUB_EPI16
+#undef HIGH_FDCT32x32_2D_C
+#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c
new file mode 100644
index 00000000000..e03a76d2e89
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c
@@ -0,0 +1,1022 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vp9/encoder/vp9_dct.h"
+#include "vp9/encoder/x86/vp9_dct_sse2.h"
+#include "vpx_ports/mem.h"
+
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // This 2D transform implements 4 vertical 1D transforms followed
+ // by 4 horizontal 1D transforms. The multiplies and adds are as given
+ // by Chen, Smith and Fralick ('77). The commands for moving the data
+ // around have been minimized by hand.
+  // For the purposes of the comments, the 16 inputs are referred to as i0
+  // through iF (in raster order), intermediate variables are a0, b0, c0
+  // through f, and correspond to the in-place computations mapped to input
+  // locations. The outputs, o0 through oF, are labeled according to the
+  // output locations.
+
+ // Constants
+ // These are the coefficients used for the multiplies.
+ // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+ // where cospi_N_64 = cos(N pi /64)
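+  // (Concretely, cospi_N_64 is the Q14 fixed-point value
+  // round(16384 * cos(N pi / 64)); the >> DCT_CONST_BITS (== 14) after
+  // each multiply stage undoes that scale.)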
+ const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+ cospi_8_64, cospi_24_64,
+ cospi_24_64, -cospi_8_64,
+ cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+ cospi_24_64, -cospi_8_64,
+ cospi_8_64, cospi_24_64,
+ cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+ cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64,
+ -cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+ cospi_24_64, -cospi_8_64,
+ -cospi_24_64, cospi_8_64,
+ -cospi_24_64, cospi_8_64);
+
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // This second rounding constant saves doing some extra adds at the end
+ const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
+ +(DCT_CONST_ROUNDING << 1));
+ const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
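+  // ROUNDING2 equals 3 * DCT_CONST_ROUNDING, i.e. DCT_CONST_ROUNDING +
+  // (1 << DCT_CONST_BITS): the usual bias for the >> DCT_CONST_BITS step
+  // plus the +1 bias of the final (x + 1) >> 2 post-scale, so both shifts
+  // fold into a single shift by DCT_CONST_BITS2.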
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i in0, in1;
+#if DCT_HIGH_BIT_DEPTH
+ __m128i cmp0, cmp1;
+ int test, overflow;
+#endif
+
+ // Load inputs.
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+ (input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+ (input + 3 * stride)));
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+#if DCT_HIGH_BIT_DEPTH
+  // Check that the inputs are small enough to use the optimised code path.
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
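+  // 0x3ff is +1023 and 0xfc00 is -1024 as int16, so `test` goes non-zero
+  // whenever any input lies outside [-1024, 1023]; larger magnitudes could
+  // exceed 16-bit range in the shifted arithmetic below, hence the
+  // fallback to the C implementation.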
+ test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
+ if (test) {
+ vp9_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // multiply by 16 to give some extra precision
+ in0 = _mm_slli_epi16(in0, 4);
+ in1 = _mm_slli_epi16(in1, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ // add 1 to the upper left pixel if it is non-zero, which helps reduce
+ // the round-trip error
+ {
+ // The mask will only contain whether the first value is zero, all
+ // other comparison will fail as something shifted by 4 (above << 4)
+ // can never be equal to one. To increment in the non-zero case, we
+ // add the mask and one for the first element:
+ // - if zero, mask = -1, v = v - 1 + 1 = v
+ // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+ __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+ in0 = _mm_add_epi16(in0, mask);
+ in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+ }
+  // There are 4 total stages, alternating between an add/subtract stage
+  // and a multiply-and-add stage.
+ {
+ // Stage 1: Add/subtract
+
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+ const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+ // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+ // r1 = [iC i8 iD i9 iE iA iF iB]
+ const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+ const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+ // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+ // r3 = [iC i8 iD i9 iF iB iE iA]
+
+ const __m128i t0 = _mm_add_epi16(r2, r3);
+ const __m128i t1 = _mm_sub_epi16(r2, r3);
+ // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+ // t1 = [aC a8 aD a9 aF aB aE aA]
+
+ // Stage 2: multiply by constants (which gets us into 32 bits).
+ // The constants needed here are:
+ // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+ // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+ // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+ // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+ // Then add and right-shift to get back to 16-bit range
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // w0 = [b0 b1 b7 b6]
+ // w1 = [b8 b9 bF bE]
+ // w2 = [b4 b5 b3 b2]
+ // w3 = [bC bD bB bA]
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ vp9_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+ // x1 = [b4 b5 b3 b2 bC bD bB bA]
+ in0 = _mm_shuffle_epi32(x0, 0xD8);
+ in1 = _mm_shuffle_epi32(x1, 0x8D);
+ // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+ // in1 = [b3 b2 bB bA b4 b5 bC bD]
+ }
+ {
+ // vertical DCTs finished. Now we do the horizontal DCTs.
+ // Stage 3: Add/subtract
+
+ const __m128i t0 = ADD_EPI16(in0, in1);
+ const __m128i t1 = SUB_EPI16(in0, in1);
+ // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
+ // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&t0, &t1);
+ if (overflow) {
+ vp9_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // Stage 4: multiply by constants (which gets us into 32 bits).
+ {
+ // The constants needed here are:
+ // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+ // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+ // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+ // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+ const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+ const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+      // Then add and right-shift to get back to 16-bit range, but this
+      // also folds in the final right-shift to save operations. This
+      // unusual rounding operation maintains bit-accurate compatibility
+      // with the C version of this function, which has two rounding
+      // steps in a row.
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+ // w0 = [o0 o4 o8 oC]
+ // w1 = [o2 o6 oA oE]
+ // w2 = [o1 o5 o9 oD]
+ // w3 = [o3 o7 oB oF]
+ // remember the o's are numbered according to the correct output location
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ vp9_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+ // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+ const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+ const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+ // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+ // y1 = [o2 o3 o6 o7 oA oB oE oF]
+ in0 = _mm_unpacklo_epi32(y0, y1);
+ // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+ in1 = _mm_unpackhi_epi32(y0, y1);
+ // in1 = [o8 o9 oA oB oC oD oE oF]
+ }
+ }
+ }
+ // Post-condition (v + 1) >> 2 is now incorporated into previous
+ // add and right-shift commands. Only 2 store instructions needed
+ // because we are using the fact that 1/3 are stored just after 0/2.
+ storeu_output(&in0, output + 0 * 4);
+ storeu_output(&in1, output + 2 * 4);
+}
+
+
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(in0, in7);
+ const __m128i q1 = ADD_EPI16(in1, in6);
+ const __m128i q2 = ADD_EPI16(in2, in5);
+ const __m128i q3 = ADD_EPI16(in3, in4);
+ const __m128i q4 = SUB_EPI16(in3, in4);
+ const __m128i q5 = SUB_EPI16(in2, in5);
+ const __m128i q6 = SUB_EPI16(in1, in6);
+ const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+ if (pass == 1) {
+ overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+ &q4, &q5, &q6, &q7);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+ }
+#endif // DCT_HIGH_BIT_DEPTH
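+    // The check above is gated on pass == 1: the first-pass inputs were
+    // only shifted left by 2, which evidently cannot push this first
+    // add/subtract out of int16 range, whereas second-pass intermediates
+    // can.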
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+      // Interleave to do the multiply by constants, which gets us into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+      // Interleave to do the multiply by constants, which gets us into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // divide each 16-bit signed number by two using shifts:
+ // n / 2 = (n - (n >> 15)) >> 1
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ store_output(&in0, (output + 0 * 8));
+ store_output(&in1, (output + 1 * 8));
+ store_output(&in2, (output + 2 * 8));
+ store_output(&in3, (output + 3 * 8));
+ store_output(&in4, (output + 4 * 8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
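
The post-condition above divides every coefficient by two without a real
division instruction. A minimal scalar model of the trick (an illustration,
not part of the patch):

    #include <stdint.h>

    /* Divide a 16-bit signed value by two, rounding toward zero. An
     * arithmetic shift alone would round toward negative infinity;
     * subtracting the sign mask (-1 for negative n, 0 otherwise) first
     * corrects that, giving n / 2 = (n - (n >> 15)) >> 1. */
    static int16_t div2_round_to_zero(int16_t n) {
      return (int16_t)((n - (n >> 15)) >> 1);
    }

For example, -3 >> 1 is -2, but (-3 - (-1)) >> 1 is -1, matching C integer
division.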
+
+void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done in two very similar passes. The first pass
+ // transforms the columns and transposes the results. The second pass
+ // transforms the rows; because the first pass left its results
+ // transposed, it again operates on columns (the original rows) and
+ // transposes its results back into normal row order.
+ int pass;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[256]);
+ const int16_t *in = input;
+ int16_t *out0 = intermediate;
+ tran_low_t *out1 = output;
+ // Constants
+ // In one case all eight lanes share a single constant. In all others a
+ // pair of constants is repeated four times, which is done by
+ // constructing the 32-bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
+ if (0 == pass) {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
+ // x = x << 2
+ in00 = _mm_slli_epi16(in00, 2);
+ in01 = _mm_slli_epi16(in01, 2);
+ in02 = _mm_slli_epi16(in02, 2);
+ in03 = _mm_slli_epi16(in03, 2);
+ in04 = _mm_slli_epi16(in04, 2);
+ in05 = _mm_slli_epi16(in05, 2);
+ in06 = _mm_slli_epi16(in06, 2);
+ in07 = _mm_slli_epi16(in07, 2);
+ in08 = _mm_slli_epi16(in08, 2);
+ in09 = _mm_slli_epi16(in09, 2);
+ in10 = _mm_slli_epi16(in10, 2);
+ in11 = _mm_slli_epi16(in11, 2);
+ in12 = _mm_slli_epi16(in12, 2);
+ in13 = _mm_slli_epi16(in13, 2);
+ in14 = _mm_slli_epi16(in14, 2);
+ in15 = _mm_slli_epi16(in15, 2);
+ } else {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
+ // x = (x + 1) >> 2
+ in00 = _mm_add_epi16(in00, kOne);
+ in01 = _mm_add_epi16(in01, kOne);
+ in02 = _mm_add_epi16(in02, kOne);
+ in03 = _mm_add_epi16(in03, kOne);
+ in04 = _mm_add_epi16(in04, kOne);
+ in05 = _mm_add_epi16(in05, kOne);
+ in06 = _mm_add_epi16(in06, kOne);
+ in07 = _mm_add_epi16(in07, kOne);
+ in08 = _mm_add_epi16(in08, kOne);
+ in09 = _mm_add_epi16(in09, kOne);
+ in10 = _mm_add_epi16(in10, kOne);
+ in11 = _mm_add_epi16(in11, kOne);
+ in12 = _mm_add_epi16(in12, kOne);
+ in13 = _mm_add_epi16(in13, kOne);
+ in14 = _mm_add_epi16(in14, kOne);
+ in15 = _mm_add_epi16(in15, kOne);
+ in00 = _mm_srai_epi16(in00, 2);
+ in01 = _mm_srai_epi16(in01, 2);
+ in02 = _mm_srai_epi16(in02, 2);
+ in03 = _mm_srai_epi16(in03, 2);
+ in04 = _mm_srai_epi16(in04, 2);
+ in05 = _mm_srai_epi16(in05, 2);
+ in06 = _mm_srai_epi16(in06, 2);
+ in07 = _mm_srai_epi16(in07, 2);
+ in08 = _mm_srai_epi16(in08, 2);
+ in09 = _mm_srai_epi16(in09, 2);
+ in10 = _mm_srai_epi16(in10, 2);
+ in11 = _mm_srai_epi16(in11, 2);
+ in12 = _mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = ADD_EPI16(in00, in15);
+ input1 = ADD_EPI16(in01, in14);
+ input2 = ADD_EPI16(in02, in13);
+ input3 = ADD_EPI16(in03, in12);
+ input4 = ADD_EPI16(in04, in11);
+ input5 = ADD_EPI16(in05, in10);
+ input6 = ADD_EPI16(in06, in09);
+ input7 = ADD_EPI16(in07, in08);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+ &input4, &input5, &input6, &input7);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = SUB_EPI16(in07, in08);
+ step1_1 = SUB_EPI16(in06, in09);
+ step1_2 = SUB_EPI16(in05, in10);
+ step1_3 = SUB_EPI16(in04, in11);
+ step1_4 = SUB_EPI16(in03, in12);
+ step1_5 = SUB_EPI16(in02, in13);
+ step1_6 = SUB_EPI16(in01, in14);
+ step1_7 = SUB_EPI16(in00, in15);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+ &step1_2, &step1_3,
+ &step1_4, &step1_5,
+ &step1_6, &step1_7);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(input0, input7);
+ const __m128i q1 = ADD_EPI16(input1, input6);
+ const __m128i q2 = ADD_EPI16(input2, input5);
+ const __m128i q3 = ADD_EPI16(input3, input4);
+ const __m128i q4 = SUB_EPI16(input3, input4);
+ const __m128i q5 = SUB_EPI16(input2, input5);
+ const __m128i q6 = SUB_EPI16(input1, input6);
+ const __m128i q7 = SUB_EPI16(input0, input7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+ &q4, &q5, &q6, &q7);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING,
+ DCT_CONST_BITS);
+ const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING,
+ DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res02, &res14,
+ &res10, &res06);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5,
+ &step2_4);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 3
+ {
+ step3_0 = ADD_EPI16(step1_0, step2_3);
+ step3_1 = ADD_EPI16(step1_1, step2_2);
+ step3_2 = SUB_EPI16(step1_1, step2_2);
+ step3_3 = SUB_EPI16(step1_0, step2_3);
+ step3_4 = SUB_EPI16(step1_7, step2_4);
+ step3_5 = SUB_EPI16(step1_6, step2_5);
+ step3_6 = ADD_EPI16(step1_6, step2_5);
+ step3_7 = ADD_EPI16(step1_7, step2_4);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3_0, &step3_1,
+ &step3_2, &step3_3,
+ &step3_4, &step3_5,
+ &step3_6, &step3_7);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 4
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6,
+ &step2_5);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 5
+ {
+ step1_0 = ADD_EPI16(step3_0, step2_1);
+ step1_1 = SUB_EPI16(step3_0, step2_1);
+ step1_2 = ADD_EPI16(step3_3, step2_2);
+ step1_3 = SUB_EPI16(step3_3, step2_2);
+ step1_4 = SUB_EPI16(step3_4, step2_5);
+ step1_5 = ADD_EPI16(step3_4, step2_5);
+ step1_6 = SUB_EPI16(step3_7, step2_6);
+ step1_7 = ADD_EPI16(step3_7, step2_6);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+ &step1_2, &step1_3,
+ &step1_4, &step1_5,
+ &step1_6, &step1_7);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 6
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Transpose the results, do it as two 8x8 transposes.
+ transpose_and_output8x8(&res00, &res01, &res02, &res03,
+ &res04, &res05, &res06, &res07,
+ pass, out0, out1);
+ transpose_and_output8x8(&res08, &res09, &res10, &res11,
+ &res12, &res13, &res14, &res15,
+ pass, out0 + 8, out1 + 8);
+ if (pass == 0) {
+ out0 += 8*16;
+ } else {
+ out1 += 8*16;
+ }
+ }
+ // Set up the input pointer for the next pass.
+ in = intermediate;
+ }
+}
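
Each mult_round_shift call above stands in for the unpack/madd/round/shift
sequence written out long-hand in FDCT8x8_2D. A scalar sketch of one output
lane, assuming the DCT_CONST_BITS/DCT_CONST_ROUNDING definitions from
vp9/common/vp9_idct.h:

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* One lane of mult_round_shift: multiply a pair of 16-bit inputs by
     * a cosine pair (the two halves of a pair_set_epi16 constant), add
     * the rounding constant, and shift back into 16-bit range. */
    static int16_t mult_round_shift_lane(int16_t a, int16_t b,
                                         int16_t c0, int16_t c1) {
      const int32_t product = a * c0 + b * c1;        /* _mm_madd_epi16 */
      return (int16_t)((product + DCT_CONST_ROUNDING) /* _mm_add_epi32  */
                       >> DCT_CONST_BITS);            /* _mm_srai_epi32 */
    }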
+
+#undef ADD_EPI16
+#undef SUB_EPI16
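
ADD_EPI16 and SUB_EPI16 are undefined here after being used throughout both
transforms. Their definitions sit outside this hunk; a plausible sketch,
assuming the high-bit-depth build switches to saturating arithmetic so that
the check_epi16_overflow_* helpers can observe out-of-range sums:

    #if DCT_HIGH_BIT_DEPTH
    #define ADD_EPI16 _mm_adds_epi16  /* saturating: overflow detectable */
    #define SUB_EPI16 _mm_subs_epi16
    #else
    #define ADD_EPI16 _mm_add_epi16   /* wrapping: 8-bit input range     */
    #define SUB_EPI16 _mm_sub_epi16   /* cannot overflow 16-bit sums     */
    #endif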
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
index f71181c5e91..b41fbc8b3bb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
@@ -62,9 +62,40 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
psllw m2, 2
psllw m3, 2
+%if CONFIG_VP9_HIGHBITDEPTH
+ pxor m4, m4
+ pxor m5, m5
+ pcmpgtw m4, m0
+ pcmpgtw m5, m1
+ movq m6, m0
+ movq m7, m1
+ punpcklwd m0, m4
+ punpcklwd m1, m5
+ punpckhwd m6, m4
+ punpckhwd m7, m5
+ movq [outputq], m0
+ movq [outputq + 8], m6
+ movq [outputq + 16], m1
+ movq [outputq + 24], m7
+ pxor m4, m4
+ pxor m5, m5
+ pcmpgtw m4, m2
+ pcmpgtw m5, m3
+ movq m6, m2
+ movq m7, m3
+ punpcklwd m2, m4
+ punpcklwd m3, m5
+ punpckhwd m6, m4
+ punpckhwd m7, m5
+ movq [outputq + 32], m2
+ movq [outputq + 40], m6
+ movq [outputq + 48], m3
+ movq [outputq + 56], m7
+%else
movq [outputq], m0
movq [outputq + 8], m1
movq [outputq + 16], m2
movq [outputq + 24], m3
+%endif
RET
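
The new %if CONFIG_VP9_HIGHBITDEPTH branch widens each 16-bit result to 32
bits before storing, since tran_low_t is 32 bits wide in that build. The
pcmpgtw + punpcklwd/punpckhwd pair is the standard SSE2 sign-extension
idiom; an equivalent intrinsics sketch (an illustration, not the committed
code):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Sign-extend eight 16-bit coefficients to 32 bits and store them,
     * mirroring the pcmpgtw + punpcklwd/punpckhwd sequence above. */
    static void store_widened(const __m128i v, int32_t *out) {
      /* 0xffff in lanes where v < 0, zero elsewhere. */
      const __m128i sign = _mm_cmpgt_epi16(_mm_setzero_si128(), v);
      const __m128i lo = _mm_unpacklo_epi16(v, sign); /* low four lanes  */
      const __m128i hi = _mm_unpackhi_epi16(v, sign); /* high four lanes */
      _mm_storeu_si128((__m128i *)(out + 0), lo);
      _mm_storeu_si128((__m128i *)(out + 4), hi);
    }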
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index e799951c2b3..564b7955e5b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -8,13 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vp9/encoder/vp9_dct.h"
+#include "vp9/encoder/x86/vp9_dct_sse2.h"
#include "vpx_ports/mem.h"
-#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
-
-void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
__m128i in0, in1;
__m128i tmp;
const __m128i zero = _mm_setzero_si128();
@@ -40,209 +41,9 @@ void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
in1 = _mm_add_epi32(tmp, in0);
in0 = _mm_slli_epi32(in1, 1);
- _mm_store_si128((__m128i *)(output), in0);
-}
-
-void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
- // This 2D transform implements 4 vertical 1D transforms followed
- // by 4 horizontal 1D transforms. The multiplies and adds are as given
- // by Chen, Smith and Fralick ('77). The commands for moving the data
- // around have been minimized by hand.
- // For the purposes of the comments, the 16 inputs are referred to at i0
- // through iF (in raster order), intermediate variables are a0, b0, c0
- // through f, and correspond to the in-place computations mapped to input
- // locations. The outputs, o0 through oF are labeled according to the
- // output locations.
-
- // Constants
- // These are the coefficients used for the multiplies.
- // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
- // where cospi_N_64 = cos(N pi /64)
- const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64);
- const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
- cospi_8_64, cospi_24_64,
- cospi_24_64, -cospi_8_64,
- cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
- cospi_24_64, -cospi_8_64,
- cospi_8_64, cospi_24_64,
- cospi_8_64, cospi_24_64);
- const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64);
- const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
- cospi_8_64, cospi_24_64,
- -cospi_8_64, -cospi_24_64,
- -cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
- cospi_24_64, -cospi_8_64,
- -cospi_24_64, cospi_8_64,
- -cospi_24_64, cospi_8_64);
-
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- // This second rounding constant saves doing some extra adds at the end
- const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
- +(DCT_CONST_ROUNDING << 1));
- const int DCT_CONST_BITS2 = DCT_CONST_BITS+2;
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- __m128i in0, in1;
-
- // Load inputs.
- {
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
- (input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
- (input + 3 * stride)));
- // in0 = [i0 i1 i2 i3 iC iD iE iF]
- // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-
-
- // multiply by 16 to give some extra precision
- in0 = _mm_slli_epi16(in0, 4);
- in1 = _mm_slli_epi16(in1, 4);
- // if (i == 0 && input[0]) input[0] += 1;
- // add 1 to the upper left pixel if it is non-zero, which helps reduce
- // the round-trip error
- {
- // The mask will only contain whether the first value is zero, all
- // other comparison will fail as something shifted by 4 (above << 4)
- // can never be equal to one. To increment in the non-zero case, we
- // add the mask and one for the first element:
- // - if zero, mask = -1, v = v - 1 + 1 = v
- // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
- __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
- in0 = _mm_add_epi16(in0, mask);
- in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
- }
- }
- // There are 4 total stages, alternating between an add/subtract stage
- // followed by an multiply-and-add stage.
- {
- // Stage 1: Add/subtract
-
- // in0 = [i0 i1 i2 i3 iC iD iE iF]
- // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
- const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
- const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
- // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
- // r1 = [iC i8 iD i9 iE iA iF iB]
- const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
- const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
- // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
- // r3 = [iC i8 iD i9 iF iB iE iA]
-
- const __m128i t0 = _mm_add_epi16(r2, r3);
- const __m128i t1 = _mm_sub_epi16(r2, r3);
- // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
- // t1 = [aC a8 aD a9 aF aB aE aA]
-
- // Stage 2: multiply by constants (which gets us into 32 bits).
- // The constants needed here are:
- // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
- // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
- // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
- // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
- // Then add and right-shift to get back to 16-bit range
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // w0 = [b0 b1 b7 b6]
- // w1 = [b8 b9 bF bE]
- // w2 = [b4 b5 b3 b2]
- // w3 = [bC bD bB bA]
- const __m128i x0 = _mm_packs_epi32(w0, w1);
- const __m128i x1 = _mm_packs_epi32(w2, w3);
- // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
- // x1 = [b4 b5 b3 b2 bC bD bB bA]
- in0 = _mm_shuffle_epi32(x0, 0xD8);
- in1 = _mm_shuffle_epi32(x1, 0x8D);
- // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
- // in1 = [b3 b2 bB bA b4 b5 bC bD]
- }
- {
- // vertical DCTs finished. Now we do the horizontal DCTs.
- // Stage 3: Add/subtract
-
- const __m128i t0 = _mm_add_epi16(in0, in1);
- const __m128i t1 = _mm_sub_epi16(in0, in1);
- // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
- // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
-
- // Stage 4: multiply by constants (which gets us into 32 bits).
- // The constants needed here are:
- // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
- // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
- // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
- // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
- const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
- const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
- // Then add and right-shift to get back to 16-bit range
- // but this combines the final right-shift as well to save operations
- // This unusual rounding operations is to maintain bit-accurate
- // compatibility with the c version of this function which has two
- // rounding steps in a row.
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
- // w0 = [o0 o4 o8 oC]
- // w1 = [o2 o6 oA oE]
- // w2 = [o1 o5 o9 oD]
- // w3 = [o3 o7 oB oF]
- // remember the o's are numbered according to the correct output location
- const __m128i x0 = _mm_packs_epi32(w0, w1);
- const __m128i x1 = _mm_packs_epi32(w2, w3);
- // x0 = [o0 o4 o8 oC o2 o6 oA oE]
- // x1 = [o1 o5 o9 oD o3 o7 oB oF]
- const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
- const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
- // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
- // y1 = [o2 o3 o6 o7 oA oB oE oF]
- in0 = _mm_unpacklo_epi32(y0, y1);
- // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
- in1 = _mm_unpackhi_epi32(y0, y1);
- // in1 = [o8 o9 oA oB oC oD oE oF]
- }
- // Post-condition (v + 1) >> 2 is now incorporated into previous
- // add and right-shift commands. Only 2 store instructions needed
- // because we are using the fact that 1/3 are stored just after 0/2.
- {
- _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
- _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
- }
+ store_output(&in0, output);
}
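
The hand-written _mm_store_si128 calls throughout this file are replaced by
store_output, declared in the newly included vp9/encoder/x86/vp9_dct_sse2.h.
Its body is not part of this diff; a sketch consistent with its call sites,
assuming tran_low_t widens to 32 bits only under CONFIG_VP9_HIGHBITDEPTH:

    static INLINE void store_output(const __m128i *poutput,
                                    tran_low_t *dst_ptr) {
    #if CONFIG_VP9_HIGHBITDEPTH
      /* tran_low_t is 32-bit: sign-extend the eight 16-bit lanes. */
      const __m128i zero = _mm_setzero_si128();
      const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
      const __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
      const __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
      _mm_store_si128((__m128i *)(dst_ptr), out0);
      _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
    #else
      /* tran_low_t is 16-bit: a single aligned store suffices. */
      _mm_store_si128((__m128i *)(dst_ptr), *poutput);
    #endif
    }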
-
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
int stride) {
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
@@ -264,7 +65,7 @@ static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}
-static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
+static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
const __m128i kOne = _mm_set1_epi16(1);
__m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
__m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
@@ -272,8 +73,8 @@ static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
__m128i out23 = _mm_add_epi16(in23, kOne);
out01 = _mm_srai_epi16(out01, 2);
out23 = _mm_srai_epi16(out23, 2);
- _mm_store_si128((__m128i *)(output + 0 * 8), out01);
- _mm_store_si128((__m128i *)(output + 1 * 8), out23);
+ store_output(&out01, (output + 0 * 8));
+ store_output(&out23, (output + 1 * 8));
}
static INLINE void transpose_4x4(__m128i *res) {
@@ -296,7 +97,7 @@ static INLINE void transpose_4x4(__m128i *res) {
}
void fdct4_sse2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -333,7 +134,7 @@ void fadst4_sse2(__m128i *in) {
const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
const __m128i kZero = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
@@ -376,7 +177,7 @@ void fadst4_sse2(__m128i *in) {
transpose_4x4(in);
}
-void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
+void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in[4];
@@ -408,7 +209,7 @@ void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
}
}
-void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
__m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
__m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
__m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
@@ -445,16 +246,25 @@ void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0);
- _mm_store_si128((__m128i *)(output), in1);
+ store_output(&in1, output);
}
-void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
+ int16_t* coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t* zbin_ptr,
+ const int16_t* round_ptr, const int16_t* quant_ptr,
+ const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+ int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+ uint16_t* eob_ptr,
+ const int16_t* scan_ptr,
+ const int16_t* iscan_ptr) {
+ __m128i zero;
int pass;
// Constants
// In one case all eight lanes share a single constant. In all others a
// pair of constants is repeated four times, which is done by
// constructing the 32-bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -472,6 +282,14 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
__m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
__m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
__m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i *in[8];
+ int index = 0;
+
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)coeff_ptr;
+
// Pre-condition input (shift by two)
in0 = _mm_slli_epi16(in0, 2);
in1 = _mm_slli_epi16(in1, 2);
@@ -482,6 +300,15 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
in6 = _mm_slli_epi16(in6, 2);
in7 = _mm_slli_epi16(in7, 2);
+ in[0] = &in0;
+ in[1] = &in1;
+ in[2] = &in2;
+ in[3] = &in3;
+ in[4] = &in4;
+ in[5] = &in5;
+ in[6] = &in6;
+ in[7] = &in7;
+
// We do two passes, first the columns, then the rows. The results of the
// first pass are transposed so that the same column code can be reused. The
// results of the second pass are also transposed so that the rows (processed
@@ -692,15 +519,175 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
in5 = _mm_srai_epi16(in5, 1);
in6 = _mm_srai_epi16(in6, 1);
in7 = _mm_srai_epi16(in7, 1);
- // store results
- _mm_store_si128((__m128i *)(output + 0 * 8), in0);
- _mm_store_si128((__m128i *)(output + 1 * 8), in1);
- _mm_store_si128((__m128i *)(output + 2 * 8), in2);
- _mm_store_si128((__m128i *)(output + 3 * 8), in3);
- _mm_store_si128((__m128i *)(output + 4 * 8), in4);
- _mm_store_si128((__m128i *)(output + 5 * 8), in5);
- _mm_store_si128((__m128i *)(output + 6 * 8), in6);
- _mm_store_si128((__m128i *)(output + 7 * 8), in7);
+ }
+
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
+ {
+ round = _mm_load_si128((const __m128i*)round_ptr);
+ quant = _mm_load_si128((const __m128i*)quant_ptr);
+ dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = *in[0];
+ coeff1 = *in[1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // AC only loop
+ index = 2;
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+ coeff0 = *in[index];
+ coeff1 = *in[index + 1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ index += 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+ } else {
+ do {
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
}
}
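
The quantization path above leans twice on the "poor man's sign extract":
abs(x) is computed as (x ^ sign) - sign with sign = x >> 15, and the same
xor/subtract pair re-applies the sign after the multiply. A scalar model of
the absolute-value half (illustration only):

    #include <stdint.h>

    /* sign is -1 (all ones) when x is negative and 0 otherwise, so
     * (x ^ sign) - sign is ~x + 1 = -x for negative x and x unchanged
     * otherwise. Applying the identical xor/sub restores the sign. */
    static int16_t abs16(int16_t x) {
      const int16_t sign = (int16_t)(x >> 15);
      return (int16_t)((x ^ sign) - sign);
    }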
@@ -727,9 +714,7 @@ static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
}
// right shift and rounding
-static INLINE void right_shift_8x8(__m128i *res, int const bit) {
- const __m128i kOne = _mm_set1_epi16(1);
- const int bit_m02 = bit - 2;
+static INLINE void right_shift_8x8(__m128i *res, const int bit) {
__m128i sign0 = _mm_srai_epi16(res[0], 15);
__m128i sign1 = _mm_srai_epi16(res[1], 15);
__m128i sign2 = _mm_srai_epi16(res[2], 15);
@@ -739,16 +724,16 @@ static INLINE void right_shift_8x8(__m128i *res, int const bit) {
__m128i sign6 = _mm_srai_epi16(res[6], 15);
__m128i sign7 = _mm_srai_epi16(res[7], 15);
- if (bit_m02 >= 0) {
- __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
- res[0] = _mm_add_epi16(res[0], k_const_rounding);
- res[1] = _mm_add_epi16(res[1], k_const_rounding);
- res[2] = _mm_add_epi16(res[2], k_const_rounding);
- res[3] = _mm_add_epi16(res[3], k_const_rounding);
- res[4] = _mm_add_epi16(res[4], k_const_rounding);
- res[5] = _mm_add_epi16(res[5], k_const_rounding);
- res[6] = _mm_add_epi16(res[6], k_const_rounding);
- res[7] = _mm_add_epi16(res[7], k_const_rounding);
+ if (bit == 2) {
+ const __m128i const_rounding = _mm_set1_epi16(1);
+ res[0] = _mm_add_epi16(res[0], const_rounding);
+ res[1] = _mm_add_epi16(res[1], const_rounding);
+ res[2] = _mm_add_epi16(res[2], const_rounding);
+ res[3] = _mm_add_epi16(res[3], const_rounding);
+ res[4] = _mm_add_epi16(res[4], const_rounding);
+ res[5] = _mm_add_epi16(res[5], const_rounding);
+ res[6] = _mm_add_epi16(res[6], const_rounding);
+ res[7] = _mm_add_epi16(res[7], const_rounding);
}
res[0] = _mm_sub_epi16(res[0], sign0);
@@ -760,31 +745,95 @@ static INLINE void right_shift_8x8(__m128i *res, int const bit) {
res[6] = _mm_sub_epi16(res[6], sign6);
res[7] = _mm_sub_epi16(res[7], sign7);
- res[0] = _mm_srai_epi16(res[0], bit);
- res[1] = _mm_srai_epi16(res[1], bit);
- res[2] = _mm_srai_epi16(res[2], bit);
- res[3] = _mm_srai_epi16(res[3], bit);
- res[4] = _mm_srai_epi16(res[4], bit);
- res[5] = _mm_srai_epi16(res[5], bit);
- res[6] = _mm_srai_epi16(res[6], bit);
- res[7] = _mm_srai_epi16(res[7], bit);
+ if (bit == 1) {
+ res[0] = _mm_srai_epi16(res[0], 1);
+ res[1] = _mm_srai_epi16(res[1], 1);
+ res[2] = _mm_srai_epi16(res[2], 1);
+ res[3] = _mm_srai_epi16(res[3], 1);
+ res[4] = _mm_srai_epi16(res[4], 1);
+ res[5] = _mm_srai_epi16(res[5], 1);
+ res[6] = _mm_srai_epi16(res[6], 1);
+ res[7] = _mm_srai_epi16(res[7], 1);
+ } else {
+ res[0] = _mm_srai_epi16(res[0], 2);
+ res[1] = _mm_srai_epi16(res[1], 2);
+ res[2] = _mm_srai_epi16(res[2], 2);
+ res[3] = _mm_srai_epi16(res[3], 2);
+ res[4] = _mm_srai_epi16(res[4], 2);
+ res[5] = _mm_srai_epi16(res[5], 2);
+ res[6] = _mm_srai_epi16(res[6], 2);
+ res[7] = _mm_srai_epi16(res[7], 2);
+ }
}
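
The rewritten right_shift_8x8 drops the general shift-by-bit code in favor
of the only two cases its callers use (bit == 1 and bit == 2), which lets
the rounding constant become a plain +1 add. A scalar model of one lane,
following the vector step order (sign sampled before the bias is added):

    #include <stdint.h>

    /* One lane of right_shift_8x8: add the +1 rounding bias for bit == 2
     * only, subtract the previously sampled sign mask, then shift. */
    static int16_t right_shift_lane(int16_t n, int bit) {
      const int16_t sign = (int16_t)(n >> 15);
      if (bit == 2) n = (int16_t)(n + 1);
      return (int16_t)((n - sign) >> bit);
    }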
// write 8x8 array
-static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
- _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
- _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
- _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
- _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
- _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
- _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
- _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
- _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
+static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
+ int stride) {
+ store_output(&res[0], (output + 0 * stride));
+ store_output(&res[1], (output + 1 * stride));
+ store_output(&res[2], (output + 2 * stride));
+ store_output(&res[3], (output + 3 * stride));
+ store_output(&res[4], (output + 4 * stride));
+ store_output(&res[5], (output + 5 * stride));
+ store_output(&res[6], (output + 6 * stride));
+ store_output(&res[7], (output + 7 * stride));
+}
+
+// perform in-place transpose
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
}
void fdct8_sse2(__m128i *in) {
// constants
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -936,7 +985,7 @@ void fadst8_sse2(__m128i *in) {
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__const_0 = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -1152,7 +1201,7 @@ void fadst8_sse2(__m128i *in) {
array_transpose_8x8(in, in);
}
-void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
+void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in[8];
@@ -1187,7 +1236,8 @@ void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
}
}
-void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
__m128i in0, in1, in2, in3;
__m128i u0, u1;
__m128i sum = _mm_setzero_si128();
@@ -1252,632 +1302,7 @@ void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 1);
- _mm_store_si128((__m128i *)(output), in1);
-}
-
-void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
- const int16_t *in = input;
- int16_t *out = intermediate;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
- for (column_start = 0; column_start < 16; column_start += 8) {
- __m128i in00, in01, in02, in03, in04, in05, in06, in07;
- __m128i in08, in09, in10, in11, in12, in13, in14, in15;
- __m128i input0, input1, input2, input3, input4, input5, input6, input7;
- __m128i step1_0, step1_1, step1_2, step1_3;
- __m128i step1_4, step1_5, step1_6, step1_7;
- __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- __m128i step3_0, step3_1, step3_2, step3_3;
- __m128i step3_4, step3_5, step3_6, step3_7;
- __m128i res00, res01, res02, res03, res04, res05, res06, res07;
- __m128i res08, res09, res10, res11, res12, res13, res14, res15;
- // Load and pre-condition input.
- if (0 == pass) {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
- // x = x << 2
- in00 = _mm_slli_epi16(in00, 2);
- in01 = _mm_slli_epi16(in01, 2);
- in02 = _mm_slli_epi16(in02, 2);
- in03 = _mm_slli_epi16(in03, 2);
- in04 = _mm_slli_epi16(in04, 2);
- in05 = _mm_slli_epi16(in05, 2);
- in06 = _mm_slli_epi16(in06, 2);
- in07 = _mm_slli_epi16(in07, 2);
- in08 = _mm_slli_epi16(in08, 2);
- in09 = _mm_slli_epi16(in09, 2);
- in10 = _mm_slli_epi16(in10, 2);
- in11 = _mm_slli_epi16(in11, 2);
- in12 = _mm_slli_epi16(in12, 2);
- in13 = _mm_slli_epi16(in13, 2);
- in14 = _mm_slli_epi16(in14, 2);
- in15 = _mm_slli_epi16(in15, 2);
- } else {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
- // x = (x + 1) >> 2
- in00 = _mm_add_epi16(in00, kOne);
- in01 = _mm_add_epi16(in01, kOne);
- in02 = _mm_add_epi16(in02, kOne);
- in03 = _mm_add_epi16(in03, kOne);
- in04 = _mm_add_epi16(in04, kOne);
- in05 = _mm_add_epi16(in05, kOne);
- in06 = _mm_add_epi16(in06, kOne);
- in07 = _mm_add_epi16(in07, kOne);
- in08 = _mm_add_epi16(in08, kOne);
- in09 = _mm_add_epi16(in09, kOne);
- in10 = _mm_add_epi16(in10, kOne);
- in11 = _mm_add_epi16(in11, kOne);
- in12 = _mm_add_epi16(in12, kOne);
- in13 = _mm_add_epi16(in13, kOne);
- in14 = _mm_add_epi16(in14, kOne);
- in15 = _mm_add_epi16(in15, kOne);
- in00 = _mm_srai_epi16(in00, 2);
- in01 = _mm_srai_epi16(in01, 2);
- in02 = _mm_srai_epi16(in02, 2);
- in03 = _mm_srai_epi16(in03, 2);
- in04 = _mm_srai_epi16(in04, 2);
- in05 = _mm_srai_epi16(in05, 2);
- in06 = _mm_srai_epi16(in06, 2);
- in07 = _mm_srai_epi16(in07, 2);
- in08 = _mm_srai_epi16(in08, 2);
- in09 = _mm_srai_epi16(in09, 2);
- in10 = _mm_srai_epi16(in10, 2);
- in11 = _mm_srai_epi16(in11, 2);
- in12 = _mm_srai_epi16(in12, 2);
- in13 = _mm_srai_epi16(in13, 2);
- in14 = _mm_srai_epi16(in14, 2);
- in15 = _mm_srai_epi16(in15, 2);
- }
- in += 8;
- // Calculate input for the first 8 results.
- {
- input0 = _mm_add_epi16(in00, in15);
- input1 = _mm_add_epi16(in01, in14);
- input2 = _mm_add_epi16(in02, in13);
- input3 = _mm_add_epi16(in03, in12);
- input4 = _mm_add_epi16(in04, in11);
- input5 = _mm_add_epi16(in05, in10);
- input6 = _mm_add_epi16(in06, in09);
- input7 = _mm_add_epi16(in07, in08);
- }
- // Calculate input for the next 8 results.
- {
- step1_0 = _mm_sub_epi16(in07, in08);
- step1_1 = _mm_sub_epi16(in06, in09);
- step1_2 = _mm_sub_epi16(in05, in10);
- step1_3 = _mm_sub_epi16(in04, in11);
- step1_4 = _mm_sub_epi16(in03, in12);
- step1_5 = _mm_sub_epi16(in02, in13);
- step1_6 = _mm_sub_epi16(in01, in14);
- step1_7 = _mm_sub_epi16(in00, in15);
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- // Add/subtract
- const __m128i q0 = _mm_add_epi16(input0, input7);
- const __m128i q1 = _mm_add_epi16(input1, input6);
- const __m128i q2 = _mm_add_epi16(input2, input5);
- const __m128i q3 = _mm_add_epi16(input3, input4);
- const __m128i q4 = _mm_sub_epi16(input3, input4);
- const __m128i q5 = _mm_sub_epi16(input2, input5);
- const __m128i q6 = _mm_sub_epi16(input1, input6);
- const __m128i q7 = _mm_sub_epi16(input0, input7);
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = _mm_add_epi16(q0, q3);
- const __m128i r1 = _mm_add_epi16(q1, q2);
- const __m128i r2 = _mm_sub_epi16(q1, q2);
- const __m128i r3 = _mm_sub_epi16(q0, q3);
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res00 = _mm_packs_epi32(w0, w1);
- res08 = _mm_packs_epi32(w2, w3);
- res04 = _mm_packs_epi32(w4, w5);
- res12 = _mm_packs_epi32(w6, w7);
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/subtract
- const __m128i x0 = _mm_add_epi16(q4, r0);
- const __m128i x1 = _mm_sub_epi16(q4, r0);
- const __m128i x2 = _mm_sub_epi16(q7, r1);
- const __m128i x3 = _mm_add_epi16(q7, r1);
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res02 = _mm_packs_epi32(w0, w1);
- res14 = _mm_packs_epi32(w2, w3);
- res10 = _mm_packs_epi32(w4, w5);
- res06 = _mm_packs_epi32(w6, w7);
- }
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_2 = _mm_packs_epi32(w0, w1);
- step2_3 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_5 = _mm_packs_epi32(w0, w1);
- step2_4 = _mm_packs_epi32(w2, w3);
- }
- // step 3
- {
- step3_0 = _mm_add_epi16(step1_0, step2_3);
- step3_1 = _mm_add_epi16(step1_1, step2_2);
- step3_2 = _mm_sub_epi16(step1_1, step2_2);
- step3_3 = _mm_sub_epi16(step1_0, step2_3);
- step3_4 = _mm_sub_epi16(step1_7, step2_4);
- step3_5 = _mm_sub_epi16(step1_6, step2_5);
- step3_6 = _mm_add_epi16(step1_6, step2_5);
- step3_7 = _mm_add_epi16(step1_7, step2_4);
- }
- // step 4
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_1 = _mm_packs_epi32(w0, w1);
- step2_2 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_6 = _mm_packs_epi32(w0, w1);
- step2_5 = _mm_packs_epi32(w2, w3);
- }
- // step 5
- {
- step1_0 = _mm_add_epi16(step3_0, step2_1);
- step1_1 = _mm_sub_epi16(step3_0, step2_1);
- step1_2 = _mm_add_epi16(step3_3, step2_2);
- step1_3 = _mm_sub_epi16(step3_3, step2_2);
- step1_4 = _mm_sub_epi16(step3_4, step2_5);
- step1_5 = _mm_add_epi16(step3_4, step2_5);
- step1_6 = _mm_sub_epi16(step3_7, step2_6);
- step1_7 = _mm_add_epi16(step3_7, step2_6);
- }
- // step 6
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res01 = _mm_packs_epi32(w0, w1);
- res09 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res05 = _mm_packs_epi32(w0, w1);
- res13 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res11 = _mm_packs_epi32(w0, w1);
- res03 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res15 = _mm_packs_epi32(w0, w1);
- res07 = _mm_packs_epi32(w2, w3);
- }
- }
- // Transpose the results, do it as two 8x8 transposes.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 54 54 55 55 56 56 57 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 21 36
- // 44 54 64 74 45 55 61 76
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
- _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
- _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
- _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
- _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
- _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
- _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
- _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
- }
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 54 54 55 55 56 56 57 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 21 36
- // 44 54 64 74 45 55 61 76
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- // Store results
- _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
- _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
- _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
- _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
- _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
- _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
- _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
- _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
- }
- out += 8*16;
- }
- // Setup in/out for next pass.
- in = intermediate;
- out = output;
- }
+ store_output(&in1, output);
}
static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
@@ -1892,7 +1317,7 @@ static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
}
-static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
+static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
__m128i *in1, int stride) {
// write first 8 columns
write_buffer_8x8(output, in0, stride);
@@ -1903,6 +1328,23 @@ static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+ __m128i tbuf[8];
+ array_transpose_8x8(res0, res0);
+ array_transpose_8x8(res1, tbuf);
+ array_transpose_8x8(res0 + 8, res1);
+ array_transpose_8x8(res1 + 8, res1 + 8);
+
+ res0[8] = tbuf[0];
+ res0[9] = tbuf[1];
+ res0[10] = tbuf[2];
+ res0[11] = tbuf[3];
+ res0[12] = tbuf[4];
+ res0[13] = tbuf[5];
+ res0[14] = tbuf[6];
+ res0[15] = tbuf[7];
+}
+
static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
// perform rounding operations
right_shift_8x8(res0, 2);
@@ -1914,7 +1356,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
void fdct16_8col(__m128i *in) {
// perform 16x16 1-D DCT for 8 columns
__m128i i[8], s[8], p[8], t[8], u[16], v[16];
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
@@ -2261,8 +1703,8 @@ void fadst16_8col(__m128i *in) {
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -2715,7 +2157,7 @@ void fadst16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
}
-void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
+void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in0[16], in1[16];
@@ -2750,7 +2192,8 @@ void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
}
}
-void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
__m128i in0, in1, in2, in3;
__m128i u0, u1;
__m128i sum = _mm_setzero_si128();
@@ -2818,17 +2261,167 @@ void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 3);
- _mm_store_si128((__m128i *)(output), in1);
+ store_output(&in1, output);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+/* These SSE2 versions of the FHT functions only actually use SSE2 in the
+ * DCT_DCT case; in all other cases they fall back to C code identical to
+ * that used by the plain C versions of these functions.
+ */
+
+void vp9_highbd_fht4x4_sse2(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ if (tx_type == DCT_DCT) {
+ vp9_highbd_fdct4x4_sse2(input, output, stride);
+ } else {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = &out[0];
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+ const transform_2d ht = FHT_4[tx_type];
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = input[j * stride + i] * 16;
+ if (i == 0 && temp_in[0])
+ temp_in[0] += 1;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 4; ++j)
+ outptr[j * 4 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j + i * 4];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 4; ++j)
+ output[j + i * 4] = (temp_out[j] + 1) >> 2;
+ }
+ }
+}
+
+void vp9_highbd_fht8x8_sse2(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ if (tx_type == DCT_DCT) {
+ vp9_highbd_fdct8x8_sse2(input, output, stride);
+ } else {
+ tran_low_t out[64];
+ tran_low_t *outptr = &out[0];
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ const transform_2d ht = FHT_8[tx_type];
+
+ // Columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 8; ++j)
+ outptr[j * 8 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j + i * 8];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 8; ++j)
+ output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ }
+}
+
+void vp9_highbd_fht16x16_sse2(int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ if (tx_type == DCT_DCT) {
+ vp9_highbd_fdct16x16_sse2(input, output, stride);
+ } else {
+ tran_low_t out[256];
+ tran_low_t *outptr = &out[0];
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ const transform_2d ht = FHT_16[tx_type];
+
+ // Columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j + i * 16];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ output[j + i * 16] = temp_out[j];
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+/*
+ * The DCTnxn functions are defined using the macros below. The main code
+ * for them is in the separate files vp9/encoder/x86/vp9_dct_impl_sse2.c and
+ * vp9/encoder/x86/vp9_dct32x32_sse2.c, which are included by both the 8-bit
+ * and the high-bit-depth builds.
+ */
+
+#define DCT_HIGH_BIT_DEPTH 0
+
+#define FDCT4x4_2D vp9_fdct4x4_sse2
+#define FDCT8x8_2D vp9_fdct8x8_sse2
+#define FDCT16x16_2D vp9_fdct16x16_sse2
+#include "vp9/encoder/x86/vp9_dct_impl_sse2.c"
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
-#undef FDCT32x32_HIGH_PRECISION
#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
+#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
+
+#undef DCT_HIGH_BIT_DEPTH
+
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+#define DCT_HIGH_BIT_DEPTH 1
+
+#define FDCT4x4_2D vp9_highbd_fdct4x4_sse2
+#define FDCT8x8_2D vp9_highbd_fdct8x8_sse2
+#define FDCT16x16_2D vp9_highbd_fdct16x16_sse2
+#include "vp9/encoder/x86/vp9_dct_impl_sse2.c" // NOLINT
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
+#define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vp9_highbd_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#undef DCT_HIGH_BIT_DEPTH
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.h
new file mode 100644
index 00000000000..b99db923ef0
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.h
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_X86_VP9_DCT_SSE2_H_
+#define VP9_ENCODER_X86_VP9_DCT_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
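+// Builds the 32-bit lane pattern {a, b, a, b}, lowest lane first
+// (_mm_set_epi32 takes its arguments from the highest lane down).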
+#define pair_set_epi32(a, b) \
+ _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
+void vp9_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
+void vp9_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
+void vp9_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
+void vp9_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output,
+ int stride);
+void vp9_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output,
+ int stride);
+void vp9_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output,
+ int stride);
+
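+// 32-bit multiply-add: returns the two 64-bit lanes
+// { a0 * b0 + a1 * b1, a2 * b2 + a3 * b3 }, treating each 32-bit input lane
+// as unsigned (_mm_mul_epu32 only reads lanes 0 and 2, so the odd lanes are
+// shifted down before the second multiply).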
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+ __m128i buf0, buf1;
+ buf0 = _mm_mul_epu32(a, b);
+ a = _mm_srli_epi64(a, 32);
+ b = _mm_srli_epi64(b, 32);
+ buf1 = _mm_mul_epu32(a, b);
+ return _mm_add_epi64(buf0, buf1);
+}
+
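+// Narrows two registers of 64-bit lanes into one register of 32-bit lanes by
+// keeping the low dword of every 64-bit lane (no saturation).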
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+ __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm_unpacklo_epi64(buf0, buf1);
+}
+
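+// The check_epi16_overflow_* helpers detect possible overflow after 16-bit
+// saturating packs: a lane equal to INT16_MAX (0x7fff) or INT16_MIN (0x8000)
+// may have been clipped, so a nonzero return marks the result as suspect.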
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+ const __m128i *preg1) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16(0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ cmp0 = _mm_or_si128(cmp0, cmp1);
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16(0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+ _mm_cmpeq_epi16(*preg2, min_overflow));
+ __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+ _mm_cmpeq_epi16(*preg3, min_overflow));
+ cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x8(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ return res0 + res1;
+}
+
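+// In the wider checks below, later groups are only tested while the running
+// result is still zero: once overflow has been seen the combined return
+// value is already nonzero, so the remaining work can be skipped.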
+static INLINE int check_epi16_overflow_x12(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *preg8,
+ const __m128i *preg9,
+ const __m128i *preg10,
+ const __m128i *preg11) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0)
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x16(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *preg8,
+ const __m128i *preg9,
+ const __m128i *preg10,
+ const __m128i *preg11,
+ const __m128i *preg12,
+ const __m128i *preg13,
+ const __m128i *preg14,
+ const __m128i *preg15) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ }
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x32(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *preg8,
+ const __m128i *preg9,
+ const __m128i *preg10,
+ const __m128i *preg11,
+ const __m128i *preg12,
+ const __m128i *preg13,
+ const __m128i *preg14,
+ const __m128i *preg15,
+ const __m128i *preg16,
+ const __m128i *preg17,
+ const __m128i *preg18,
+ const __m128i *preg19,
+ const __m128i *preg20,
+ const __m128i *preg21,
+ const __m128i *preg22,
+ const __m128i *preg23,
+ const __m128i *preg24,
+ const __m128i *preg25,
+ const __m128i *preg26,
+ const __m128i *preg27,
+ const __m128i *preg28,
+ const __m128i *preg29,
+ const __m128i *preg30,
+ const __m128i *preg31) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+ }
+ }
+ }
+ }
+ }
+ return res0 + res1;
+}
+
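+// The k_check_epi32_overflow_* helpers verify that each 64-bit lane of the
+// inputs still fits in a signed 32-bit value: after shifting left by one,
+// the top dword of every lane must be all zeros (non-negative) or all ones
+// (negative); anything else is reported as overflow.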
+static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *zero) {
+ __m128i minus_one = _mm_set1_epi32(-1);
+ // Check for overflows
+ __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
+ __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
+ __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
+ __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
+ __m128i reg0_top_dwords = _mm_shuffle_epi32(
+ reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg1_top_dwords = _mm_shuffle_epi32(
+ reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg2_top_dwords = _mm_shuffle_epi32(
+ reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg3_top_dwords = _mm_shuffle_epi32(
+ reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
+ __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
+  __m128i valid_positive_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
+  __m128i valid_positive_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
+  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
+  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
+  int overflow_01 = _mm_movemask_epi8(
+      _mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
+  int overflow_23 = _mm_movemask_epi8(
+      _mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
+ return (overflow_01 + overflow_23);
+}
+
+static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ }
+ return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_16(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *preg8,
+ const __m128i *preg9,
+ const __m128i *preg10,
+ const __m128i *preg11,
+ const __m128i *preg12,
+ const __m128i *preg13,
+ const __m128i *preg14,
+ const __m128i *preg15,
+ const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11,
+ zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+ zero);
+ }
+ }
+ }
+ return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *preg8,
+ const __m128i *preg9,
+ const __m128i *preg10,
+ const __m128i *preg11,
+ const __m128i *preg12,
+ const __m128i *preg13,
+ const __m128i *preg14,
+ const __m128i *preg15,
+ const __m128i *preg16,
+ const __m128i *preg17,
+ const __m128i *preg18,
+ const __m128i *preg19,
+ const __m128i *preg20,
+ const __m128i *preg21,
+ const __m128i *preg22,
+ const __m128i *preg23,
+ const __m128i *preg24,
+ const __m128i *preg25,
+ const __m128i *preg26,
+ const __m128i *preg27,
+ const __m128i *preg28,
+ const __m128i *preg29,
+ const __m128i *preg30,
+ const __m128i *preg31,
+ const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+ zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19,
+ zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg20, preg21,
+ preg22, preg23, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg24, preg25,
+ preg26, preg27, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg28, preg29,
+ preg30, preg31, zero);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return overflow;
+}
+
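+// Stores eight 16-bit results to a tran_low_t buffer. In high-bit-depth
+// builds tran_low_t is 32 bits wide, so the values are sign-extended and
+// written as two 128-bit stores; otherwise they are stored directly.
+// storeu_output below is the unaligned variant.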
+static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_store_si128((__m128i *)(dst_ptr), out0);
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+
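+// Multiply/round/shift helper: computes
+//   packs_epi32((madd(in0, mult) + rounding) >> shift,
+//               (madd(in1, mult) + rounding) >> shift),
+// i.e. the dct_const_round_shift pattern on two interleaved registers.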
+static INLINE __m128i mult_round_shift(const __m128i *pin0,
+ const __m128i *pin1,
+ const __m128i *pmultiplier,
+ const __m128i *prounding,
+ const int shift) {
+ const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
+ const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
+ const __m128i v0 = _mm_add_epi32(u0, *prounding);
+ const __m128i v1 = _mm_add_epi32(u1, *prounding);
+ const __m128i w0 = _mm_srai_epi32(v0, shift);
+ const __m128i w1 = _mm_srai_epi32(v1, shift);
+ return _mm_packs_epi32(w0, w1);
+}
+
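+// Transposes an 8x8 block held in eight registers and writes it out. Pass 0
+// stores to the 16-bit intermediate buffer; pass 1 goes through
+// storeu_output so the final coefficients are widened to tran_low_t when
+// needed.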
+static INLINE void transpose_and_output8x8(
+ const __m128i *pin00, const __m128i *pin01,
+ const __m128i *pin02, const __m128i *pin03,
+ const __m128i *pin04, const __m128i *pin05,
+ const __m128i *pin06, const __m128i *pin07,
+ const int pass, int16_t* out0_ptr,
+ tran_low_t* out1_ptr) {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+  // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+  // 04 14 24 34 05 15 25 35
+  // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
+ } else {
+ storeu_output(&tr2_0, (out1_ptr + 0 * 16));
+ storeu_output(&tr2_1, (out1_ptr + 1 * 16));
+ storeu_output(&tr2_2, (out1_ptr + 2 * 16));
+ storeu_output(&tr2_3, (out1_ptr + 3 * 16));
+ storeu_output(&tr2_4, (out1_ptr + 4 * 16));
+ storeu_output(&tr2_5, (out1_ptr + 5 * 16));
+ storeu_output(&tr2_6, (out1_ptr + 6 * 16));
+ storeu_output(&tr2_7, (out1_ptr + 7 * 16));
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_ENCODER_X86_VP9_DCT_SSE2_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
new file mode 100644
index 00000000000..1c1005aeeda
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before including tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
+#include <tmmintrin.h> // SSSE3
+#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
+ int16_t* coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t* zbin_ptr,
+ const int16_t* round_ptr, const int16_t* quant_ptr,
+ const int16_t* quant_shift_ptr,
+ int16_t* qcoeff_ptr,
+ int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+ uint16_t* eob_ptr,
+ const int16_t* scan_ptr,
+ const int16_t* iscan_ptr) {
+ __m128i zero;
+ int pass;
+  // Constants
+  //    In one case all lanes hold the same constant. In every other case a
+  //    pair of constants is repeated four times, which is done by
+  //    constructing the 32-bit constant corresponding to that pair.
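+  // 23170 is 2 * cospi_16_64, so _mm_mulhrs_epi16 against this constant
+  // computes (x * cospi_16_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS in a
+  // single 16-bit operation (see the second butterfly block below).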
+ const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i *in[8];
+ int index = 0;
+
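+  // These parameters are unused by this version; the casts below silence
+  // unused-parameter warnings.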
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)coeff_ptr;
+
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ in[0] = &in0;
+ in[1] = &in1;
+ in[2] = &in2;
+ in[3] = &in3;
+ in[4] = &in4;
+ in[5] = &in5;
+ in[6] = &in6;
+ in[7] = &in7;
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(in0, in7);
+ const __m128i q1 = _mm_add_epi16(in1, in6);
+ const __m128i q2 = _mm_add_epi16(in2, in5);
+ const __m128i q3 = _mm_add_epi16(in3, in4);
+ const __m128i q4 = _mm_sub_epi16(in3, in4);
+ const __m128i q5 = _mm_sub_epi16(in2, in5);
+ const __m128i q6 = _mm_sub_epi16(in1, in6);
+ const __m128i q7 = _mm_sub_epi16(in0, in7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+      // No interleave needed here: _mm_mulhrs_epi16 against 2 * cospi_16_64
+      // performs the multiply and the dct_const_round_shift in one 16-bit
+      // step.
+ const __m128i d0 = _mm_sub_epi16(q6, q5);
+ const __m128i d1 = _mm_add_epi16(q6, q5);
+ const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
+ const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
+
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+    // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+    // 04 14 24 34 05 15 25 35
+    // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ }
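+  // Illustrative scalar sketch of the n/2 identity used above (assumes
+  // arithmetic right shift on two's-complement int16_t):
+  //   static int16_t div2_round_to_zero(int16_t n) {
+  //     return (int16_t)((n - (n >> 15)) >> 1);
+  //   }
+  //   div2_round_to_zero(7) == 3 and div2_round_to_zero(-7) == -3:
+  //   n >> 15 is -1 only for negative n, supplying the bias that makes
+  //   the shift round toward zero like integer division.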
+
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant, thr;
+ int16_t nzflag;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
+ {
+ round = _mm_load_si128((const __m128i*)round_ptr);
+ quant = _mm_load_si128((const __m128i*)quant_ptr);
+ dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = *in[0];
+ coeff1 = *in[1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
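+      // The xor/sub pair above is a branchless abs(); scalar sketch:
+      // with s = x >> 15 (0 or -1), abs(x) == (x ^ s) - s, since XOR
+      // with -1 is bitwise NOT and subtracting -1 adds one.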
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // AC only loop
+ index = 2;
+ thr = _mm_srai_epi16(dequant, 1);
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+ coeff0 = *in[index];
+ coeff1 = *in[index + 1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
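+        // If no lane's magnitude exceeds thr (half the dequant step), the
+        // whole group is treated as quantizing to zero and the multiply
+        // and store work below is skipped.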
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
+ }
+
+ if (nzflag) {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ index += 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
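+    // The shuffle/max ladder above is a horizontal maximum; scalar sketch,
+    // with lane as a hypothetical int16_t[8] view of eob:
+    //   int16_t m = lane[0];
+    //   for (i = 1; i < 8; ++i) if (lane[i] > m) m = lane[i];
+    //   *eob_ptr = m;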
+ } else {
+ do {
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
index 28458dcdd52..3a29aba6f27 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
@@ -179,4 +179,77 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
mova [outputq + 112], m7
RET
+
+%macro HMD8_1D 0
+ psubw m8, m0, m1
+ psubw m9, m2, m3
+ paddw m0, m1
+ paddw m2, m3
+ SWAP 1, 8
+ SWAP 3, 9
+ psubw m8, m4, m5
+ psubw m9, m6, m7
+ paddw m4, m5
+ paddw m6, m7
+ SWAP 5, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m2
+ psubw m9, m1, m3
+ paddw m0, m2
+ paddw m1, m3
+ SWAP 2, 8
+ SWAP 3, 9
+ psubw m8, m4, m6
+ psubw m9, m5, m7
+ paddw m4, m6
+ paddw m5, m7
+ SWAP 6, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m4
+ psubw m9, m1, m5
+ paddw m0, m4
+ paddw m1, m5
+ SWAP 4, 8
+ SWAP 5, 9
+ psubw m8, m2, m6
+ psubw m9, m3, m7
+ paddw m2, m6
+ paddw m3, m7
+ SWAP 6, 8
+ SWAP 7, 9
+%endmacro
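+; Each SWAP-delimited group above is one butterfly stage of an 8-point
+; Hadamard transform; in scalar terms a stage computes, for each pair
+; (a, b): sum = a + b, diff = a - b. The three stages pair registers at
+; strides 1, 2 and 4 to complete the 1-D transform.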
+
+INIT_XMM ssse3
+cglobal hadamard_8x8, 3, 5, 10, input, stride, output
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ HMD8_1D
+ TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+ HMD8_1D
+
+ mova [outputq + 0], m0
+ mova [outputq + 16], m1
+ mova [outputq + 32], m2
+ mova [outputq + 48], m3
+ mova [outputq + 64], m4
+ mova [outputq + 80], m5
+ mova [outputq + 96], m6
+ mova [outputq + 112], m7
+
+ RET
%endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
index bf5fa889f22..bf7c7af7707 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -23,18 +23,17 @@
// Compute the sum of all pixel differences of this MB.
static INLINE int sum_diff_16x1(__m128i acc_diff) {
const __m128i k_1 = _mm_set1_epi16(1);
- const __m128i acc_diff_lo = _mm_srai_epi16(
- _mm_unpacklo_epi8(acc_diff, acc_diff), 8);
- const __m128i acc_diff_hi = _mm_srai_epi16(
- _mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+ const __m128i acc_diff_lo =
+ _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+ const __m128i acc_diff_hi =
+ _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
- const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
- _mm_srli_si128(hg_fe_dc_ba, 8));
- const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
- _mm_srli_si128(hgfe_dcba, 4));
- int sum_diff = _mm_cvtsi128_si32(hgfedcba);
- return sum_diff;
+ const __m128i hgfe_dcba =
+ _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
+ const __m128i hgfedcba =
+ _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
+ return _mm_cvtsi128_si32(hgfedcba);
}
// Denoise a 16x1 vector.
@@ -51,8 +50,8 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
__m128i acc_diff) {
// Calculate differences
const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
- const __m128i v_mc_running_avg_y = _mm_loadu_si128(
- (const __m128i *)(&mc_running_avg_y[0]));
+ const __m128i v_mc_running_avg_y =
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
__m128i v_running_avg_y;
const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
@@ -60,8 +59,8 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
// Clamp absolute difference to 16 to be used to get mask. Doing this
// allows us to use _mm_cmpgt_epi8, which operates on signed byte.
- const __m128i clamped_absdiff = _mm_min_epu8(
- _mm_or_si128(pdiff, ndiff), *k_16);
+ const __m128i clamped_absdiff =
+ _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
// Get masks for l2 l1 and l0 adjustments.
const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
@@ -95,24 +94,22 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
}
// Denoise a 16x1 vector with a weaker filter.
-static INLINE __m128i vp9_denoiser_adj_16x1_sse2(const uint8_t *sig,
- const uint8_t *mc_running_avg_y,
- uint8_t *running_avg_y,
- const __m128i k_0,
- const __m128i k_delta,
- __m128i acc_diff) {
+static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y,
+ uint8_t *running_avg_y, const __m128i k_0,
+ const __m128i k_delta, __m128i acc_diff) {
__m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
// Calculate differences.
const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
const __m128i v_mc_running_avg_y =
- _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
// Obtain the sign. FF if diff is negative.
const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
// Clamp absolute difference to delta to get the adjustment.
const __m128i adj =
- _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+ _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
// Restore the sign and get positive and negative adjustments.
__m128i padj, nadj;
padj = _mm_andnot_si128(diff_sign, adj);
@@ -128,19 +125,16 @@ static INLINE __m128i vp9_denoiser_adj_16x1_sse2(const uint8_t *sig,
return acc_diff;
}
-static int vp9_denoiser_4xM_sse2(const uint8_t *sig, int sig_stride,
- const uint8_t *mc_running_avg_y,
- int mc_avg_y_stride,
- uint8_t *running_avg_y, int avg_y_stride,
- int increase_denoising,
- BLOCK_SIZE bs,
- int motion_magnitude) {
- int sum_diff_thresh;
- int r;
- int shift_inc = (increase_denoising &&
- motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
- unsigned char sig_buffer[2][16], mc_running_buffer[2][16],
- running_buffer[2][16];
+// Denoiser for 4xM and 8xM blocks.
+static int vp9_denoiser_NxM_sse2_small(
+ const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc = (increase_denoising &&
+ motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+ 1 : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
__m128i acc_diff = _mm_setzero_si128();
const __m128i k_0 = _mm_setzero_si128();
const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
@@ -148,145 +142,50 @@ static int vp9_denoiser_4xM_sse2(const uint8_t *sig, int sig_stride,
const __m128i k_16 = _mm_set1_epi8(16);
// Modify each level's adjustment according to motion_magnitude.
const __m128i l3 = _mm_set1_epi8(
- (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
- 7 + shift_inc : 6);
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
// Difference between level 3 and level 2 is 2.
const __m128i l32 = _mm_set1_epi8(2);
// Difference between level 2 and level 1 is 1.
const __m128i l21 = _mm_set1_epi8(1);
- int sum_diff = 0;
-
- for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) {
- vpx_memcpy(sig_buffer[r], sig, 4);
- vpx_memcpy(sig_buffer[r] + 4, sig + sig_stride, 4);
- vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride * 2, 4);
- vpx_memcpy(sig_buffer[r] + 12, sig + sig_stride * 3, 4);
- vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 4);
- vpx_memcpy(mc_running_buffer[r] + 4, mc_running_avg_y +
- mc_avg_y_stride, 4);
- vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y +
- mc_avg_y_stride * 2, 4);
- vpx_memcpy(mc_running_buffer[r] + 12, mc_running_avg_y +
- mc_avg_y_stride * 3, 4);
- vpx_memcpy(running_buffer[r], running_avg_y, 4);
- vpx_memcpy(running_buffer[r] + 4, running_avg_y +
- avg_y_stride, 4);
- vpx_memcpy(running_buffer[r] + 8, running_avg_y +
- avg_y_stride * 2, 4);
- vpx_memcpy(running_buffer[r] + 12, running_avg_y +
- avg_y_stride * 3, 4);
- acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
- mc_running_buffer[r],
- running_buffer[r],
- &k_0, &k_4, &k_8, &k_16,
- &l3, &l32, &l21, acc_diff);
- vpx_memcpy(running_avg_y, running_buffer[r], 4);
- vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4);
- vpx_memcpy(running_avg_y + avg_y_stride * 2,
- running_buffer[r] + 8, 4);
- vpx_memcpy(running_avg_y + avg_y_stride * 3,
- running_buffer[r] + 12, 4);
- // Update pointers for next iteration.
- sig += (sig_stride << 2);
- mc_running_avg_y += (mc_avg_y_stride << 2);
- running_avg_y += (avg_y_stride << 2);
- }
-
- {
- sum_diff = sum_diff_16x1(acc_diff);
- sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
- if (abs(sum_diff) > sum_diff_thresh) {
- // Before returning to copy the block (i.e., apply no denoising),
- // checK if we can still apply some (weaker) temporal filtering to
- // this block, that would otherwise not be denoised at all. Simplest
- // is to apply an additional adjustment to running_avg_y to bring it
- // closer to sig. The adjustment is capped by a maximum delta, and
- // chosen such that in most cases the resulting sum_diff will be
- // within the accceptable range given by sum_diff_thresh.
+ const uint8_t shift = (width == 4) ? 2 : 1;
- // The delta is set by the excess of absolute pixel diff over the
- // threshold.
- int delta = ((abs(sum_diff) - sum_diff_thresh)
- >> num_pels_log2_lookup[bs]) + 1;
- // Only apply the adjustment for max delta up to 3.
- if (delta < 4) {
- const __m128i k_delta = _mm_set1_epi8(delta);
- running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
- sum_diff = 0;
- for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) {
- acc_diff = vp9_denoiser_adj_16x1_sse2(
- sig_buffer[r], mc_running_buffer[r],
- running_buffer[r], k_0, k_delta,
- acc_diff);
- vpx_memcpy(running_avg_y, running_buffer[r], 4);
- vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4);
- vpx_memcpy(running_avg_y + avg_y_stride * 2,
- running_buffer[r] + 8, 4);
- vpx_memcpy(running_avg_y + avg_y_stride * 3,
- running_buffer[r] + 12, 4);
- // Update pointers for next iteration.
- running_avg_y += (avg_y_stride << 2);
- }
- sum_diff = sum_diff_16x1(acc_diff);
- if (abs(sum_diff) > sum_diff_thresh) {
- return COPY_BLOCK;
- }
- } else {
- return COPY_BLOCK;
- }
+ for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width,
+ mc_running_avg_y + mc_avg_y_stride, width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ if (width == 4) {
+ memcpy(sig_buffer[r] + width * 2, sig + sig_stride * 2, width);
+ memcpy(sig_buffer[r] + width * 3, sig + sig_stride * 3, width);
+ memcpy(mc_running_buffer[r] + width * 2,
+ mc_running_avg_y + mc_avg_y_stride * 2, width);
+ memcpy(mc_running_buffer[r] + width * 3,
+ mc_running_avg_y + mc_avg_y_stride * 3, width);
+ memcpy(running_buffer[r] + width * 2,
+ running_avg_y + avg_y_stride * 2, width);
+ memcpy(running_buffer[r] + width * 3,
+ running_avg_y + avg_y_stride * 3, width);
}
- }
- return FILTER_BLOCK;
-}
-
-static int vp9_denoiser_8xM_sse2(const uint8_t *sig, int sig_stride,
- const uint8_t *mc_running_avg_y,
- int mc_avg_y_stride,
- uint8_t *running_avg_y, int avg_y_stride,
- int increase_denoising,
- BLOCK_SIZE bs,
- int motion_magnitude) {
- int sum_diff_thresh;
- int r;
- int shift_inc = (increase_denoising &&
- motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
- unsigned char sig_buffer[8][16], mc_running_buffer[8][16],
- running_buffer[8][16];
- __m128i acc_diff = _mm_setzero_si128();
- const __m128i k_0 = _mm_setzero_si128();
- const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
- const __m128i k_8 = _mm_set1_epi8(8);
- const __m128i k_16 = _mm_set1_epi8(16);
- // Modify each level's adjustment according to motion_magnitude.
- const __m128i l3 = _mm_set1_epi8(
- (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
- 7 + shift_inc : 6);
- // Difference between level 3 and level 2 is 2.
- const __m128i l32 = _mm_set1_epi8(2);
- // Difference between level 2 and level 1 is 1.
- const __m128i l21 = _mm_set1_epi8(1);
- int sum_diff = 0;
-
- for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
- vpx_memcpy(sig_buffer[r], sig, 8);
- vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride, 8);
- vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 8);
- vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y +
- mc_avg_y_stride, 8);
- vpx_memcpy(running_buffer[r], running_avg_y, 8);
- vpx_memcpy(running_buffer[r] + 8, running_avg_y +
- avg_y_stride, 8);
acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
mc_running_buffer[r],
running_buffer[r],
&k_0, &k_4, &k_8, &k_16,
&l3, &l32, &l21, acc_diff);
- vpx_memcpy(running_avg_y, running_buffer[r], 8);
- vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+ if (width == 4) {
+ memcpy(running_avg_y + avg_y_stride * 2,
+ running_buffer[r] + width * 2, width);
+ memcpy(running_avg_y + avg_y_stride * 3,
+ running_buffer[r] + width * 3, width);
+ }
// Update pointers for next iteration.
- sig += (sig_stride << 1);
- mc_running_avg_y += (mc_avg_y_stride << 1);
- running_avg_y += (avg_y_stride << 1);
+ sig += (sig_stride << shift);
+ mc_running_avg_y += (mc_avg_y_stride << shift);
+ running_avg_y += (avg_y_stride << shift);
}
{
@@ -294,54 +193,61 @@ static int vp9_denoiser_8xM_sse2(const uint8_t *sig, int sig_stride,
sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
if (abs(sum_diff) > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising),
- // checK if we can still apply some (weaker) temporal filtering to
+ // check if we can still apply some (weaker) temporal filtering to
// this block, that would otherwise not be denoised at all. Simplest
// is to apply an additional adjustment to running_avg_y to bring it
// closer to sig. The adjustment is capped by a maximum delta, and
// chosen such that in most cases the resulting sum_diff will be
- // within the accceptable range given by sum_diff_thresh.
+ // within the acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over the
// threshold.
- int delta = ((abs(sum_diff) - sum_diff_thresh)
- >> num_pels_log2_lookup[bs]) + 1;
+ const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
+ num_pels_log2_lookup[bs]) + 1;
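+      // Worked example with assumed values: a 16x16 block has
+      // num_pels_log2_lookup[bs] == 8, so an excess of
+      // abs(sum_diff) - sum_diff_thresh == 260 gives
+      // delta == (260 >> 8) + 1 == 2.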
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
const __m128i k_delta = _mm_set1_epi8(delta);
running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
- for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+ for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
acc_diff = vp9_denoiser_adj_16x1_sse2(
- sig_buffer[r], mc_running_buffer[r],
- running_buffer[r], k_0, k_delta,
- acc_diff);
- vpx_memcpy(running_avg_y, running_buffer[r], 8);
- vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8);
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+ k_0, k_delta, acc_diff);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride,
+ running_buffer[r] + width, width);
+ if (width == 4) {
+ memcpy(running_avg_y + avg_y_stride * 2,
+ running_buffer[r] + width * 2, width);
+ memcpy(running_avg_y + avg_y_stride * 3,
+ running_buffer[r] + width * 3, width);
+ }
// Update pointers for next iteration.
- running_avg_y += (avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << shift);
}
sum_diff = sum_diff_16x1(acc_diff);
if (abs(sum_diff) > sum_diff_thresh) {
return COPY_BLOCK;
}
} else {
- return COPY_BLOCK;
+ return COPY_BLOCK;
}
}
}
return FILTER_BLOCK;
}
-static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
- const uint8_t *mc_running_avg_y,
- int mc_avg_y_stride,
- uint8_t *running_avg_y,
- int avg_y_stride,
- int increase_denoising, BLOCK_SIZE bs,
- int motion_magnitude) {
- int sum_diff_thresh;
- int r, c;
- int shift_inc = (increase_denoising &&
- motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
+// Denoiser for 16xM, 32xM and 64xM blocks
+static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y,
+ int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+ int sum_diff_thresh, r, c, sum_diff = 0;
+ const int shift_inc = (increase_denoising &&
+ motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+ 1 : 0;
__m128i acc_diff[4][4];
const __m128i k_0 = _mm_setzero_si128();
const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
@@ -349,13 +255,11 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
const __m128i k_16 = _mm_set1_epi8(16);
// Modify each level's adjustment according to motion_magnitude.
const __m128i l3 = _mm_set1_epi8(
- (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
- 7 + shift_inc : 6);
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
// Difference between level 3 and level 2 is 2.
const __m128i l32 = _mm_set1_epi8(2);
// Difference between level 2 and level 1 is 1.
const __m128i l21 = _mm_set1_epi8(1);
- int sum_diff = 0;
for (c = 0; c < 4; ++c) {
for (r = 0; r < 4; ++r) {
@@ -363,13 +267,11 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
}
}
- for (r = 0; r < (4 << b_height_log2_lookup[bs]); r++) {
+ for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
- sig, mc_running_avg_y,
- running_avg_y,
- &k_0, &k_4, &k_8, &k_16,
- &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+ sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
+ &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
// Update pointers for next iteration.
sig += 16;
mc_running_avg_y += 16;
@@ -385,8 +287,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
// Update pointers for next iteration.
sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
mc_running_avg_y = mc_running_avg_y -
- 16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
- mc_avg_y_stride;
+ 16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+ mc_avg_y_stride;
running_avg_y = running_avg_y -
16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
avg_y_stride;
@@ -395,8 +297,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
{
sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
if (abs(sum_diff) > sum_diff_thresh) {
- int delta = ((abs(sum_diff) - sum_diff_thresh)
- >> num_pels_log2_lookup[bs]) + 1;
+ const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
+ num_pels_log2_lookup[bs]) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
@@ -408,9 +310,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
- sig, mc_running_avg_y,
- running_avg_y, k_0,
- k_delta, acc_diff[c>>4][r>>4]);
+ sig, mc_running_avg_y, running_avg_y, k_0,
+ k_delta, acc_diff[c>>4][r>>4]);
// Update pointers for next iteration.
sig += 16;
mc_running_avg_y += 16;
@@ -449,25 +350,25 @@ int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
BLOCK_SIZE bs,
int motion_magnitude) {
if (bs == BLOCK_4X4 || bs == BLOCK_4X8) {
- return vp9_denoiser_4xM_sse2(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude);
+ return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
+ mc_avg, mc_avg_stride,
+ avg, avg_stride,
+ increase_denoising,
+ bs, motion_magnitude, 4);
} else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) {
- return vp9_denoiser_8xM_sse2(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude);
+ return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
+ mc_avg, mc_avg_stride,
+ avg, avg_stride,
+ increase_denoising,
+ bs, motion_magnitude, 8);
} else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
bs == BLOCK_32X16|| bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
bs == BLOCK_64X32 || bs == BLOCK_64X64) {
- return vp9_denoiser_64_32_16xM_sse2(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude);
+ return vp9_denoiser_NxM_sse2_big(sig, sig_stride,
+ mc_avg, mc_avg_stride,
+ avg, avg_stride,
+ increase_denoising,
+ bs, motion_magnitude);
} else {
return COPY_BLOCK;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
index 1126fdb6164..56373e897c9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
@@ -72,3 +72,49 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
movd edx, m5
%endif
RET
+
+; Compute the sum of squared difference between two int16_t vectors.
+; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+; intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
+ pxor m4, m4 ; sse accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2]
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq
+.loop:
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ ; accumulate in 64bit
+ punpckldq m3, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m3
+ punpckldq m3, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m3
+ paddq m4, m1
+ add sizeq, mmsize
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ paddq m4, m5
+%if ARCH_X86_64
+ movq rax, m4
+%else
+ pshufd m5, m4, 0x1
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
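+; Illustrative scalar reference for the loop above:
+;   int64_t err = 0;
+;   for (i = 0; i < block_size; i++) {
+;     const int d = coeff[i] - dqcoeff[i];
+;     err += (int64_t)d * d;
+;   }
+;   return err;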
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
new file mode 100644
index 00000000000..c245ccafa8a
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "vp9/common/vp9_common.h"
+
+int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i, j, test;
+ uint32_t temp[4];
+ __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i += 8) {
+ // Load the data into xmm registers
+ __m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
+ __m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
+ __m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
+ __m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
+ // Check if any values require more than 15 bit
+ max = _mm_set1_epi32(0x3fff);
+ min = _mm_set1_epi32(0xffffc000);
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+ _mm_cmplt_epi32(mm_coeff, min));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+ _mm_cmplt_epi32(mm_coeff2, min));
+ cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+ _mm_cmplt_epi32(mm_dqcoeff, min));
+ cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+ _mm_cmplt_epi32(mm_dqcoeff2, min));
+ test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
+ _mm_or_si128(cmp2, cmp3)));
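+    // If any lane falls outside the signed 15-bit range, _mm_packs_epi32
+    // below would saturate, so the exact scalar fallback branch is taken
+    // instead.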
+
+ if (!test) {
+      __m128i mm_diff, error_sse2, sqcoeff_sse2;
+ mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+ mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+ mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+ error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+ sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+ _mm_storeu_si128((__m128i*)temp, error_sse2);
+ error = error + temp[0] + temp[1] + temp[2] + temp[3];
+ _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
+ sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+ } else {
+ for (j = 0; j < 8; j++) {
+ const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+ }
+ }
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
new file mode 100644
index 00000000000..ffa43b65a59
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "vp9/common/vp9_common.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// from vp9_idct.h: typedef int32_t tran_low_t;
+void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
+ intptr_t count,
+ int skip_block,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ uint16_t *eob_ptr,
+ const int16_t *scan,
+ const int16_t *iscan) {
+ int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+ __m128i zbins[2];
+ __m128i nzbins[2];
+
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1],
+ (int)zbin_ptr[1],
+ (int)zbin_ptr[1],
+ (int)zbin_ptr[0]);
+ zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ (void)scan;
+
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
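+    // The backward pre-scan trims trailing 4-coefficient groups that lie
+    // entirely inside the zero bin, so the quantization pass below only
+    // visits the first non_zero_regs groups.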
+
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(
+ _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i*)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);
+
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],
+ INT32_MIN, INT32_MAX);
+ tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *
+ quant_shift_ptr[k != 0]) >> 16; // quantization
+ qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (tmp)
+ eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+ }
+ }
+ }
+ }
+ *eob_ptr = eob_i + 1;
+}
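+// The per-lane arithmetic above follows the scalar reference quantizer;
+// in outline, using the same names:
+//   tmp = clamp(abs_coeff + round, INT32_MIN, INT32_MAX);
+//   tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
+//   qcoeff  = (tmp ^ sign) - sign;
+//   dqcoeff = qcoeff * dequant;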
+
+
+void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs,
+ int skip_block,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ uint16_t *eob_ptr,
+ const int16_t *scan,
+ const int16_t *iscan) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
+ (void)scan;
+ zbins[0] = _mm_set_epi32(zbin1_tmp,
+ zbin1_tmp,
+ zbin1_tmp,
+ zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf))
+ idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0))
+ idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00))
+ idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000))
+ idx_arr[idx++] = i * 4 + 3;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int64_t tmp = clamp(abs_coeff +
+ ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+ INT32_MIN, INT32_MAX);
+ tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >> 15;
+
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+ if (tmp)
+        eob = iscan[rc] > eob ? iscan[rc] : eob;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm
new file mode 100644
index 00000000000..987729f962c
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm
@@ -0,0 +1,1055 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 15
+ times 8 dw 1
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 13
+ times 8 dw 3
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 11
+ times 8 dw 5
+ times 8 dw 10
+ times 8 dw 6
+ times 8 dw 9
+ times 8 dw 7
+ times 16 dw 8
+ times 8 dw 7
+ times 8 dw 9
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 5
+ times 8 dw 11
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 3
+ times 8 dw 13
+ times 8 dw 2
+ times 8 dw 14
+ times 8 dw 1
+ times 8 dw 15
+
+SECTION .text
+
+; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the sum of
+; squared errors (SSE) in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3
+ paddw %4, %2
+ pmaddwd %1, %1
+ movhlps %2, %4
+ paddd %6, %3
+ paddw %4, %2
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1
+ paddd %5, %4
+
+%endmacro
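+; Per invocation, SUM_SSE folds two pairs of 8-lane registers into the
+; accumulators; scalar sketch for one pair:
+;   for (i = 0; i < 8; i++) { d = src[i] - dst[i]; sum += d; sse += d*d; }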
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+ movhlps m3, m7
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ pshufd m4, m6, 0x1
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd rax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ lea srcq, [srcq + src_stridemp*2]
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
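+; On 32-bit PIC builds the stride is reloaded from its stack slot (the
+; x86inc "mp"-suffixed operand) because the GOT pointer ties up a register.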
+
+%macro INC_SRC_BY_SRC_2STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ lea srcq, [srcq + src_stridemp*4]
+%else
+ lea srcq, [srcq + src_strideq*4]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
+%else
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+ ; Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, \
+ sse, g_bilin_filter, g_pw_8
+ %define h heightd
+
+ ; Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar h, 1
+%endif
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [dstq]
+ mova m3, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [dstq]
+ mova m3, [dstq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [dstq]
+ mova m5, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [dstq]
+ mova m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; Load the bilinear filters - this is the same as in the 8-bit depth case
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register. Reuse the src_stride register;
+; src_stride then has to be reloaded from the stack whenever it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [dstq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [dstq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_2STRIDE
+ lea dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
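
The instantiations above emit the 8- and 16-wide kernels; the optional second argument selects the "avg" variant, which pavgw-blends the second predictor at [secq] into the sub-pel prediction before the error is accumulated. In every path, SUM_SSE accumulates the running sum of (pred - dst) and of (pred - dst)^2; a scalar sketch of one lane of that accumulation (hypothetical helper, not the libvpx API):

#include <stdint.h>

/* One SUM_SSE step per lane; the asm keeps the two accumulators in m6/m7. */
static void sum_sse_step(uint16_t pred, uint16_t dst,
                         int64_t *sum, uint64_t *sse) {
  const int d = (int)pred - (int)dst;
  *sum += d;
  *sse += (uint64_t)((int64_t)d * d);
}
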
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
new file mode 100644
index 00000000000..821dd0660bc
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
@@ -0,0 +1,313 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_highbd_calc16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
+sym(vp9_highbd_calc16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+16]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax+16]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+16]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 2
+ jnz .var16loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
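
The pcmpgtw/pcmpeqw/por/pcmpeqw sequence in .var16loop above builds, per 16-bit lane, a mask that is 0xFFFF exactly when the signed row-sum word is negative; punpcklwd/punpckhwd then pair each word with its mask, sign-extending the sums to 32 bits without SSE4.1's pmovsxwd. A scalar sketch of one lane (hypothetical name):

#include <stdint.h>

static int32_t widen_signed_word(int16_t w) {
  const uint16_t gt = (w > 0) ? 0xFFFFu : 0u;          /* pcmpgtw vs 0 */
  const uint16_t eq = (w == 0) ? 0xFFFFu : 0u;         /* pcmpeqw vs 0 */
  const uint16_t nonneg = (uint16_t)(gt | eq);         /* por          */
  const uint16_t mask = (nonneg == 0u) ? 0xFFFFu : 0u; /* 0xFFFF iff w < 0 */
  /* punpcklwd: the word becomes the low half, its mask the high half. */
  return (int32_t)(((uint32_t)mask << 16) | (uint16_t)w);
}
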
+
+
+;unsigned int vp9_highbd_calc8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
+sym(vp9_highbd_calc8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 8
+
+.var8loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rbx+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ lea rbx, [rbx+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 4
+ jnz .var8loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
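
Both functions share the epilogue above: interleaving each accumulator with zero (punpckldq/punpckhdq), adding, then shifting the upper quadword down (psrldq 8) and adding again reduces the four 32-bit partial sums to the single scalar written through [SSE] and [Sum]. The net effect, as a scalar sketch (hypothetical name):

#include <stdint.h>

static uint32_t hsum_epi32(const uint32_t lanes[4]) {
  return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
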
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c
new file mode 100644
index 00000000000..4bc3e7e2d15
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_variance.h"
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+static void highbd_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+}
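
The 10- and 12-bit wrappers above accumulate into 64-bit temporaries and then rescale - the sum by 2 (resp. 4) bits and the SSE by 4 (resp. 8) bits - so the returned values stay in the 8-bit-equivalent range and fit the 32-bit outputs; the plain wrapper does no rescaling. ROUND_POWER_OF_TWO rounds to nearest, equivalent to this sketch:

#include <stdint.h>

/* Round-to-nearest shift, matching ROUND_POWER_OF_TWO's semantics. */
#define ROUND_POWER_OF_TWO_SKETCH(value, n) \
  (((value) + ((int64_t)1 << ((n)-1))) >> (n))
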
+
+
+#define HIGH_GET_VAR(S) \
+void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+} \
+\
+void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+} \
+\
+void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+}
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
+#define VAR_FN(w, h, block_size, shift) \
+uint32_t vp9_highbd_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vp9_highbd_calc##block_size##x##block_size##var_sse2, \
+ block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+}
+
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
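
Each VAR_FN(w, h, block_size, shift) above reduces to variance = SSE - sum^2 / (w*h), with the division done as a shift, so shift must equal log2(w*h) - e.g. 12 for 64x64 and 9 for 32x16, as in the instantiations above. A scalar reference (hypothetical name):

#include <stdint.h>

static uint32_t variance_from_sums(uint32_t sse, int sum, int shift) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
}
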
+
+unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+ sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+ sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+ sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint16_t *dst, \
+ ptrdiff_t dst_stride, \
+ int height, unsigned int *sse);
+#define DECLS(opt1, opt2) \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst8, \
+ int dst_stride, \
+ uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, h, \
+ &sse); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+ src_stride, \
+ x_offset, y_offset, \
+ dst + 16, \
+ dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, \
+ dst + 48, dst_stride, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+ src_stride, \
+ x_offset, y_offset, \
+ dst + 16, \
+ dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ for (start_row = 0; start_row < h; start_row +=16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ }\
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+
+FNS(sse2, sse);
+
+#undef FNS
+#undef FN
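
In the FN() wrappers above, blocks wider than the 16-lane kernel (wf) are covered by 16-wide column strips at offsets 0, 16, 32 and 48, with each strip's sum and SSE accumulated; the 12-bit variant additionally walks the rows in chunks of 16 and keeps SSE in a uint64_t so the per-call 32-bit totals cannot overflow at that depth. A sketch of the column loop, where strip_fn stands in for the 16-wide kernel (hypothetical signature):

#include <stdint.h>

static void wide_block_sketch(int w, int h, int *se, uint32_t *sse,
                              int (*strip_fn)(int col, int h, uint32_t *sse)) {
  int col;
  *se = 0;
  *sse = 0;
  for (col = 0; col < w; col += 16) {
    uint32_t sse_part;
    *se += strip_fn(col, h, &sse_part);
    *sse += sse_part;
  }
}
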
+
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint16_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint16_t *sec, \
+ ptrdiff_t sec_stride, \
+ int height, \
+ unsigned int *sse);
+#define DECLS(opt1) \
+DECL(16, opt1) \
+DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, sec, w, h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, \
+ dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, \
+ dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, \
+ dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ sec, w, h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row +=16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + (start_row * dst_stride), dst_stride, \
+ sec + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 16 + (start_row * dst_stride), dst_stride, \
+ sec + 16 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 32 + (start_row * dst_stride), dst_stride, \
+ sec + 32 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 48 + (start_row * dst_stride), dst_stride, \
+ sec + 48 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
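
The *_avg_ wrappers above additionally blend a second (compound) predictor into the sub-pel prediction before the difference is taken; sec is tightly packed with stride w, which is why the 12-bit row-chunked loop advances it by start_row * w. The blend uses pavgw rounding; a scalar sketch (hypothetical name):

#include <stdint.h>

static uint16_t avg_pred_sketch(uint16_t pred, uint16_t sec) {
  return (uint16_t)((pred + sec + 1) >> 1);
}
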
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
index 7c1c8843cdf..00abd3c4962 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -18,7 +18,7 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
const int16_t* round_ptr, const int16_t* quant_ptr,
const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- int zbin_oq_value, uint16_t* eob_ptr,
+ uint16_t* eob_ptr,
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
@@ -39,13 +39,10 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
// Setup global values
{
- __m128i zbin_oq;
__m128i pw_1;
- zbin_oq = _mm_set1_epi16(zbin_oq_value);
zbin = _mm_load_si128((const __m128i*)zbin_ptr);
round = _mm_load_si128((const __m128i*)round_ptr);
quant = _mm_load_si128((const __m128i*)quant_ptr);
- zbin = _mm_add_epi16(zbin, zbin_oq);
pw_1 = _mm_set1_epi16(1);
zbin = _mm_sub_epi16(zbin, pw_1);
dequant = _mm_load_si128((const __m128i*)dequant_ptr);
@@ -223,3 +220,199 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
*eob_ptr = 0;
}
}
+
+void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t* zbin_ptr,
+ const int16_t* round_ptr, const int16_t* quant_ptr,
+ const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+ int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+ uint16_t* eob_ptr,
+ const int16_t* scan_ptr,
+ const int16_t* iscan_ptr) {
+ __m128i zero;
+ __m128i thr;
+ int16_t nzflag;
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ coeff_ptr += n_coeffs;
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant;
+ {
+ __m128i coeff0, coeff1;
+
+      // Set up global values
+ {
+ round = _mm_load_si128((const __m128i*)round_ptr);
+ quant = _mm_load_si128((const __m128i*)quant_ptr);
+ dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+ coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ thr = _mm_srai_epi16(dequant, 1);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+ coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
+ }
+
+ if (nzflag) {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+ } else {
+ do {
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
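
One lane of vp9_quantize_fp_sse2 above, as a scalar sketch: the "poor man's sign extract" is the branchless 16-bit abs (sign = v >> 15 arithmetic; abs = (v ^ sign) - sign), and the same xor/sub pair reinserts the sign after quantization; the AC loop also skips a whole group of lanes when no abs value exceeds thr = dequant >> 1. Hypothetical helper; in the real code round/quant/dequant carry the DC entry in lane 0, handled by the unpackhi shuffles:

#include <stdint.h>

static int16_t quantize_fp_lane(int16_t c, int16_t round, int16_t quant,
                                int16_t dequant, int16_t *dqcoeff) {
  const int16_t sign = (int16_t)(c >> 15);      /* arithmetic shift        */
  int32_t a = (int16_t)((c ^ sign) - sign);     /* abs(c)                  */
  int16_t q;
  a += round;
  if (a > 32767) a = 32767;                     /* _mm_adds_epi16 saturates */
  q = (int16_t)(((int32_t)a * quant) >> 16);    /* pmulhw                  */
  q = (int16_t)((q ^ sign) - sign);             /* reinsert sign           */
  *dqcoeff = (int16_t)(q * dequant);            /* pmullw (low 16 bits)    */
  return q;
}
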
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 508e1d4f55a..449d52b22e7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -15,9 +15,10 @@ pw_1: times 8 dw 1
SECTION .text
+; TODO(yunqingwang): fix the quantize_b code for the skip=1 case.
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+ shift, qcoeff, dqcoeff, dequant, \
eob, scan, iscan
cmp dword skipm, 0
jne .blank
@@ -29,13 +30,9 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
movifnidn zbinq, zbinmp
movifnidn roundq, roundmp
movifnidn quantq, quantmp
- movd m4, dword zbin_oqm ; m4 = zbin_oq
mova m0, [zbinq] ; m0 = zbin
- punpcklwd m4, m4
mova m1, [roundq] ; m1 = round
- pshufd m4, m4, 0
mova m2, [quantq] ; m2 = quant
- paddw m0, m4 ; m0 = zbin + zbin_oq
%ifidn %1, b_32x32
pcmpeqw m5, m5
psrlw m5, 15
@@ -55,7 +52,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psllw m4, 1
%endif
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
lea coeffq, [ coeffq+ncoeffq*2]
lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
@@ -122,8 +119,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
- pmovmskb r6, m7
- pmovmskb r2, m12
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
or r6, r2
jz .skip_iter
%endif
@@ -220,7 +217,7 @@ QUANTIZE_FN b_32x32, 7
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+ shift, qcoeff, dqcoeff, dequant, \
eob, scan, iscan
cmp dword skipm, 0
jne .blank
@@ -248,11 +245,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psllw m2, 1
%endif
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+
lea coeffq, [ coeffq+ncoeffq*2]
- lea iscanq, [ iscanq+ncoeffq*2]
- lea qcoeffq, [ qcoeffq+ncoeffq*2]
- lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+ lea r5q, [ r5q+ncoeffq*2]
+ lea r3q, [ r3q+ncoeffq*2]
+ lea r4q, [r4q+ncoeffq*2]
neg ncoeffq
; get DC and first 15 AC coeffs
@@ -270,28 +267,30 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pmulhw m13, m11, m2 ; m13 = m11*q>>16
psignw m8, m9 ; m8 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
- mova [qcoeffq+ncoeffq*2+ 0], m8
- mova [qcoeffq+ncoeffq*2+16], m13
+ mova [r3q+ncoeffq*2+ 0], m8
+ mova [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
pabsw m8, m8
pabsw m13, m13
%endif
- pmullw m8, m3 ; dqc[i] = qc[i] * q
+ pmullw m8, m3 ; r4[i] = r3[i] * q
punpckhqdq m3, m3
- pmullw m13, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; r4[i] = r3[i] * q
%ifidn %1, fp_32x32
psrlw m8, 1
psrlw m13, 1
psignw m8, m9
psignw m13, m10
psrlw m0, m3, 2
+%else
+ psrlw m0, m3, 1
%endif
- mova [dqcoeffq+ncoeffq*2+ 0], m8
- mova [dqcoeffq+ncoeffq*2+16], m13
+ mova [r4q+ncoeffq*2+ 0], m8
+ mova [r4q+ncoeffq*2+16], m13
pcmpeqw m8, m5 ; m8 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
psubw m6, m7 ; m6 = scan[i] + 1
psubw m11, m7 ; m11 = scan[i] + 1
pandn m8, m6 ; m8 = max(eob)
@@ -305,15 +304,15 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
-%ifidn %1, fp_32x32
+
pcmpgtw m7, m6, m0
pcmpgtw m12, m11, m0
- pmovmskb r6, m7
- pmovmskb r2, m12
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
or r6, r2
jz .skip_iter
-%endif
+
pcmpeqw m7, m7
paddsw m6, m1 ; m6 += round
@@ -322,26 +321,26 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pmulhw m13, m11, m2 ; m13 = m11*q>>16
psignw m14, m9 ; m14 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
- mova [qcoeffq+ncoeffq*2+ 0], m14
- mova [qcoeffq+ncoeffq*2+16], m13
+ mova [r3q+ncoeffq*2+ 0], m14
+ mova [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
pabsw m14, m14
pabsw m13, m13
%endif
- pmullw m14, m3 ; dqc[i] = qc[i] * q
- pmullw m13, m3 ; dqc[i] = qc[i] * q
+ pmullw m14, m3 ; r4[i] = r3[i] * q
+ pmullw m13, m3 ; r4[i] = r3[i] * q
%ifidn %1, fp_32x32
psrlw m14, 1
psrlw m13, 1
psignw m14, m9
psignw m13, m10
%endif
- mova [dqcoeffq+ncoeffq*2+ 0], m14
- mova [dqcoeffq+ncoeffq*2+16], m13
+ mova [r4q+ncoeffq*2+ 0], m14
+ mova [r4q+ncoeffq*2+16], m13
pcmpeqw m14, m5 ; m14 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
psubw m6, m7 ; m6 = scan[i] + 1
psubw m11, m7 ; m11 = scan[i] + 1
pandn m14, m6 ; m14 = max(eob)
@@ -351,16 +350,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
add ncoeffq, mmsize
jl .ac_only_loop
-%ifidn %1, fp_32x32
jmp .accumulate_eob
.skip_iter:
- mova [qcoeffq+ncoeffq*2+ 0], m5
- mova [qcoeffq+ncoeffq*2+16], m5
- mova [dqcoeffq+ncoeffq*2+ 0], m5
- mova [dqcoeffq+ncoeffq*2+16], m5
+ mova [r3q+ncoeffq*2+ 0], m5
+ mova [r3q+ncoeffq*2+16], m5
+ mova [r4q+ncoeffq*2+ 0], m5
+ mova [r4q+ncoeffq*2+16], m5
add ncoeffq, mmsize
jl .ac_only_loop
-%endif
.accumulate_eob:
; horizontally accumulate/max eobs and write into [eob] memory pointer
@@ -372,7 +369,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pshuflw m7, m8, 0x1
pmaxsw m8, m7
pextrw r6, m8, 0
- mov [r2], r6
+ mov [r2], r6
RET
; skip-block, i.e. just write all zeroes
@@ -381,19 +378,19 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
movifnidn ncoeffq, ncoeffmp
mov r2, qcoeffmp
mov r3, eobmp
- DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
- lea dqcoeffq, [dqcoeffq+ncoeffq*2]
- lea qcoeffq, [ qcoeffq+ncoeffq*2]
+
+ lea r0q, [r0q+ncoeffq*2]
+ lea r2q, [r2q+ncoeffq*2]
neg ncoeffq
pxor m7, m7
.blank_loop:
- mova [dqcoeffq+ncoeffq*2+ 0], m7
- mova [dqcoeffq+ncoeffq*2+16], m7
- mova [qcoeffq+ncoeffq*2+ 0], m7
- mova [qcoeffq+ncoeffq*2+16], m7
+ mova [r0q+ncoeffq*2+ 0], m7
+ mova [r0q+ncoeffq*2+16], m7
+ mova [r2q+ncoeffq*2+ 0], m7
+ mova [r2q+ncoeffq*2+16], m7
add ncoeffq, mmsize
jl .blank_loop
- mov word [eobq], 0
+ mov word [r3q], 0
RET
%endmacro
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm
deleted file mode 100644
index 0cb35424ed6..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm
+++ /dev/null
@@ -1,370 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm5, XMMWORD PTR [rdi]
- lddqu xmm6, XMMWORD PTR [rdi+1]
- lddqu xmm7, XMMWORD PTR [rdi+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm1, XMMWORD PTR [rdi]
- lddqu xmm2, XMMWORD PTR [rdi+1]
- lddqu xmm3, XMMWORD PTR [rdi+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- lddqu xmm1, XMMWORD PTR [rdi+rdx]
- lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
- lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm7, XMMWORD PTR [rdi+16]
-
- movdqa xmm5, xmm7
- palignr xmm5, xmm4, %2
-
- movdqa xmm6, xmm7
- palignr xmm6, xmm4, (%2+1)
-
- palignr xmm7, xmm4, (%2+2)
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm3, XMMWORD PTR [rdi+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- movdqa xmm4, XMMWORD PTR [rdi+rdx]
- movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-;void int vp9_sad16x16x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x16x3_ssse3) PRIVATE
-sym(vp9_sad16x16x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vp9_sad16x16x3_ssse3_skiptable
-.vp9_sad16x16x3_ssse3_jumptable:
- dd .vp9_sad16x16x3_ssse3_aligned_by_0 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_1 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_2 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_3 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_4 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_5 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_6 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_7 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_8 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_9 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_skiptable:
-
- call .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X16X3_OFFSET 0, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
-
-.vp9_sad16x16x3_ssse3_aligned_by_15:
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vp9_sad16x16x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void int vp9_sad16x8x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x8x3_ssse3) PRIVATE
-sym(vp9_sad16x8x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vp9_sad16x8x3_ssse3_skiptable
-.vp9_sad16x8x3_ssse3_jumptable:
- dd .vp9_sad16x8x3_ssse3_aligned_by_0 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_1 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_2 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_3 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_4 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_5 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_6 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_7 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_8 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_9 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_skiptable:
-
- call .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X8X3_OFFSET 0, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
-
-.vp9_sad16x8x3_ssse3_aligned_by_15:
-
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vp9_sad16x8x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
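The SAD routines deleted above dispatched on the 16-byte alignment of the reference pointer through a position-independent jump table: a call/pop pair recovers the current address, and a table of 32-bit label offsets selects one of sixteen unrolled paths. A minimal C sketch of the same dispatch idea, with hypothetical helper names:

    #include <stdint.h>

    /* Hypothetical sketch of the alignment dispatch the deleted assembly
     * performed with its PIC jump table: the low four bits of the reference
     * pointer select one of 16 specialized code paths. */
    typedef void (*sad_fn)(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride, int *results);

    static void sad_dispatch_by_alignment(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          int *results, const sad_fn table[16]) {
      const int misalignment = (int)((uintptr_t)ref & 0xf);  /* and rdx, rdi */
      table[misalignment](src, src_stride, ref, ref_stride, results);
    }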
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
index 1a9e4e8b6bd..06b8b034a5e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -101,7 +101,7 @@ SECTION .text
pshufd m4, m6, 0x1
movd [r1], m7 ; store sse
paddd m6, m4
- movd rax, m6 ; store sum as return value
+ movd raxd, m6 ; store sum as return value
%else ; mmsize == 8
pshufw m4, m6, 0xe
pshufw m3, m7, 0xe
@@ -113,7 +113,7 @@ SECTION .text
movd [r1], m7 ; store sse
pshufw m4, m6, 0xe
paddd m6, m4
- movd rax, m6 ; store sum as return value
+ movd raxd, m6 ; store sum as return value
%endif
RET
%endmacro
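The two hunks above narrow the final move to the 32-bit register alias (raxd in x86inc spelling), so only the low dword of the xmm register, which holds the reduced sum, lands in the return register. That returned sum (se) is then folded together with the stored sse by the C wrappers in the next file; essentially:

    #include <stdint.h>

    /* Sketch of how the wrappers combine the asm outputs:
     * variance = sse - sum^2 / (w*h), with log2_count = log2(w*h). */
    static unsigned int variance_from_sums(int se, unsigned int sse,
                                           int log2_count) {
      return sse - (unsigned int)(((int64_t)se * se) >> log2_count);
    }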
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index b4d2b0ac408..8490bbbdc2e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -314,13 +314,15 @@ unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
return *sse;
}
+// The two unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
- int height, unsigned int *sse)
+ int height, unsigned int *sse, \
+ void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
@@ -342,26 +344,26 @@ unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
unsigned int sse; \
int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
- h, &sse); \
+ h, &sse, NULL, NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
} \
@@ -391,6 +393,7 @@ FNS(ssse3, ssse3);
#undef FNS
#undef FN
+// The two unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
@@ -399,7 +402,8 @@ int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t dst_stride, \
const uint8_t *sec, \
ptrdiff_t sec_stride, \
- int height, unsigned int *sse)
+ int height, unsigned int *sse, \
+ void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
@@ -422,26 +426,30 @@ unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
unsigned int sse; \
int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
- sec, w, h, &sse); \
+ sec, w, h, &sse, NULL, \
+ NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
- sec + 16, w, h, &sse2); \
+ sec + 16, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
- sec + 32, w, h, &sse2); \
+ sec + 32, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
- sec + 48, w, h, &sse2); \
+ sec + 48, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
} \
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk
index 9414120f64e..c9326eeea69 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk
@@ -33,6 +33,7 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
VP9_COMMON_SRCS-yes += common/vp9_enums.h
VP9_COMMON_SRCS-yes += common/vp9_idct.h
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
+VP9_COMMON_SRCS-yes += common/vp9_thread_common.h
VP9_COMMON_SRCS-yes += common/vp9_mv.h
VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
@@ -56,6 +57,7 @@ VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c
+VP9_COMMON_SRCS-yes += common/vp9_thread_common.c
VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
@@ -72,6 +74,8 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_ss
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
@@ -80,6 +84,7 @@ VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
@@ -124,35 +129,76 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
+# common (msa)
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_horiz_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c
+
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_intrin_ssse3.c
ifeq ($(ARCH_X86_64), yes)
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
endif
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_8_neon_asm$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_copy_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM)
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+endif
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c
+
+# neon with assembly and intrinsics implementations. If both are available
+# prefer assembly.
+ifeq ($(HAVE_NEON_ASM), yes)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM)
+else
+ifeq ($(HAVE_NEON), yes)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c
+# TODO(johannkoenig): re-enable when the chromium build is fixed
+# https://code.google.com/p/chromium/issues/detail?id=443839
+#VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
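The final line regenerates vp9_rtcd.h from vp9_rtcd_defs.pl, the run-time CPU detection (RTCD) layer that makes per-ISA source lists like the above work: each specialized file provides an implementation, and a one-time setup routine points a function pointer at the best one available. A hedged sketch of that pattern (illustrative names, not the generated ones):

    #include <stdint.h>

    /* Illustrative RTCD-style dispatch; the generated header wires up the
     * real vp9_* entry points the same way. */
    typedef void (*idct_fn)(const int16_t *input, uint8_t *dest, int stride);

    static void idct_c(const int16_t *input, uint8_t *dest, int stride) {
      (void)input; (void)dest; (void)stride;  /* portable fallback */
    }

    static void idct_neon(const int16_t *input, uint8_t *dest, int stride) {
      (void)input; (void)dest; (void)stride;  /* SIMD specialization */
    }

    idct_fn vp9_idct_dispatch = idct_c;

    /* Run once at startup, as the generated vp9_rtcd() does. */
    void rtcd_setup(int has_neon) {
      vp9_idct_dispatch = has_neon ? idct_neon : idct_c;
    }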
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
index d0ca5242ccb..cba15e693e9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
@@ -12,7 +12,8 @@
#include <string.h>
#include "./vpx_config.h"
-#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_ports/vpx_once.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "./vpx_version.h"
#include "vp9/encoder/vp9_encoder.h"
@@ -33,12 +34,15 @@ struct vp9_extracfg {
vp8e_tuning tuning;
unsigned int cq_level; // constrained quality level
unsigned int rc_max_intra_bitrate_pct;
+ unsigned int rc_max_inter_bitrate_pct;
+ unsigned int gf_cbr_boost_pct;
unsigned int lossless;
unsigned int frame_parallel_decoding_mode;
AQ_MODE aq_mode;
unsigned int frame_periodic_boost;
vpx_bit_depth_t bit_depth;
vp9e_tune_content content;
+ vpx_color_space_t color_space;
};
static struct vp9_extracfg default_extra_cfg = {
@@ -47,19 +51,22 @@ static struct vp9_extracfg default_extra_cfg = {
0, // noise_sensitivity
0, // sharpness
0, // static_thresh
- 0, // tile_columns
+ 6, // tile_columns
0, // tile_rows
7, // arnr_max_frames
5, // arnr_strength
VP8_TUNE_PSNR, // tuning
10, // cq_level
0, // rc_max_intra_bitrate_pct
+ 0, // rc_max_inter_bitrate_pct
+ 0, // gf_cbr_boost_pct
0, // lossless
- 0, // frame_parallel_decoding_mode
+ 1, // frame_parallel_decoding_mode
NO_AQ, // aq_mode
0, // frame_periodic_delta_q
VPX_BITS_8, // Bit depth
- VP9E_CONTENT_DEFAULT // content
+ VP9E_CONTENT_DEFAULT, // content
+ VPX_CS_UNKNOWN, // color space
};
struct vpx_codec_alg_priv {
@@ -76,9 +83,13 @@ struct vpx_codec_alg_priv {
size_t pending_frame_sizes[8];
size_t pending_frame_magnitude;
vpx_image_t preview_img;
+ vpx_enc_frame_flags_t next_frame_flags;
vp8_postproc_cfg_t preview_ppcfg;
vpx_codec_pkt_list_decl(256) pkt_list;
unsigned int fixed_kf_cntr;
+ vpx_codec_priv_output_cx_pkt_cb_pair_t output_cx_pkt_cb;
+ // BufferPool that holds all reference frames.
+ BufferPool *buffer_pool;
};
static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
@@ -147,8 +158,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, g_threads, 64);
RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
- RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
- RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
+ RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
@@ -158,8 +169,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
if (cfg->rc_resize_allowed == 1) {
- RANGE_CHECK(cfg, rc_scaled_width, 1, cfg->g_w);
- RANGE_CHECK(cfg, rc_scaled_height, 1, cfg->g_h);
+ RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
+ RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h);
}
RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS);
@@ -202,8 +213,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
ERROR("kf_min_dist not supported in auto mode, use 0 "
"or kf_max_dist instead.");
- RANGE_CHECK_BOOL(extra_cfg, enable_auto_alt_ref);
- RANGE_CHECK(extra_cfg, cpu_used, -16, 16);
+ RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
+ RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
@@ -288,7 +299,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
cfg->g_bit_depth == VPX_BITS_8) {
ERROR("Codec bit-depth 8 not supported in profile > 1");
}
-
+ RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
return VPX_CODEC_OK;
}
@@ -345,11 +356,12 @@ static int get_image_bps(const vpx_image_t *img) {
}
static vpx_codec_err_t set_encoder_config(
- VP9EncoderConfig *oxcf,
- const vpx_codec_enc_cfg_t *cfg,
- const struct vp9_extracfg *extra_cfg) {
+ VP9EncoderConfig *oxcf,
+ const vpx_codec_enc_cfg_t *cfg,
+ const struct vp9_extracfg *extra_cfg) {
const int is_vbr = cfg->rc_end_usage == VPX_VBR;
oxcf->profile = cfg->g_profile;
+ oxcf->max_threads = (int)cfg->g_threads;
oxcf->width = cfg->g_w;
oxcf->height = cfg->g_h;
oxcf->bit_depth = cfg->g_bit_depth;
@@ -380,6 +392,8 @@ static vpx_codec_err_t set_encoder_config(
// Convert target bandwidth from Kbit/s to Bit/s
oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+ oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+ oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
oxcf->best_allowed_q =
extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
@@ -391,9 +405,15 @@ static vpx_codec_err_t set_encoder_config(
oxcf->under_shoot_pct = cfg->rc_undershoot_pct;
oxcf->over_shoot_pct = cfg->rc_overshoot_pct;
- oxcf->allow_spatial_resampling = cfg->rc_resize_allowed;
- oxcf->scaled_frame_width = cfg->rc_scaled_width;
- oxcf->scaled_frame_height = cfg->rc_scaled_height;
+ oxcf->scaled_frame_width = cfg->rc_scaled_width;
+ oxcf->scaled_frame_height = cfg->rc_scaled_height;
+ if (cfg->rc_resize_allowed == 1) {
+ oxcf->resize_mode =
+ (oxcf->scaled_frame_width == 0 || oxcf->scaled_frame_height == 0) ?
+ RESIZE_DYNAMIC : RESIZE_FIXED;
+ } else {
+ oxcf->resize_mode = RESIZE_NONE;
+ }
oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
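With this hunk the public rc_resize_allowed / rc_scaled_width / rc_scaled_height fields map onto an explicit resize_mode: scaled dimensions of zero now request encoder-driven (dynamic) resizing rather than a fixed scaled size. A usage sketch under that reading:

    #include "vpx/vpx_encoder.h"

    /* Sketch: request encoder-driven (dynamic) resizing under the new mapping. */
    static vpx_codec_err_t enable_dynamic_resize(vpx_codec_ctx_t *codec,
                                                 vpx_codec_enc_cfg_t *cfg) {
      cfg->rc_resize_allowed = 1;
      cfg->rc_scaled_width = 0;   /* 0 x 0 selects RESIZE_DYNAMIC; a fixed */
      cfg->rc_scaled_height = 0;  /* nonzero size would select RESIZE_FIXED */
      return vpx_codec_enc_config_set(codec, cfg);
    }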
@@ -412,7 +432,7 @@ static vpx_codec_err_t set_encoder_config(
oxcf->speed = abs(extra_cfg->cpu_used);
oxcf->encode_breakout = extra_cfg->static_thresh;
- oxcf->play_alternate = extra_cfg->enable_auto_alt_ref;
+ oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
oxcf->sharpness = extra_cfg->sharpness;
@@ -422,6 +442,7 @@ static vpx_codec_err_t set_encoder_config(
oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
#endif
+ oxcf->color_space = extra_cfg->color_space;
oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
oxcf->arnr_strength = extra_cfg->arnr_strength;
@@ -445,13 +466,13 @@ static vpx_codec_err_t set_encoder_config(
for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
oxcf->ss_target_bitrate[i] = 1000 * cfg->ss_target_bitrate[i];
#if CONFIG_SPATIAL_SVC
- oxcf->ss_play_alternate[i] = cfg->ss_enable_auto_alt_ref[i];
+ oxcf->ss_enable_auto_arf[i] = cfg->ss_enable_auto_alt_ref[i];
#endif
}
} else if (oxcf->ss_number_layers == 1) {
oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
#if CONFIG_SPATIAL_SVC
- oxcf->ss_play_alternate[0] = extra_cfg->enable_auto_alt_ref;
+ oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref;
#endif
}
@@ -493,7 +514,7 @@ static vpx_codec_err_t set_encoder_config(
printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
- printf("play_alternate: %d\n", oxcf->play_alternate);
+ printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf);
printf("Version: %d\n", oxcf->Version);
printf("encode_breakout: %d\n", oxcf->encode_breakout);
printf("error resilient: %d\n", oxcf->error_resilient_mode);
@@ -506,9 +527,16 @@ static vpx_codec_err_t set_encoder_config(
static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
const vpx_codec_enc_cfg_t *cfg) {
vpx_codec_err_t res;
-
- if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
- ERROR("Cannot change width or height after initialization");
+ int force_key = 0;
+
+ if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+ if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+ ERROR("Cannot change width or height after initialization");
+ if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
+ (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+ (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+ force_key = 1;
+ }
// Prevent increasing lag_in_frames. This check is stricter than it needs
// to be -- the limit is not increasing past the first lag_in_frames
@@ -522,9 +550,14 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
if (res == VPX_CODEC_OK) {
ctx->cfg = *cfg;
set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+ // On profile change, request a key frame
+ force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
vp9_change_config(ctx->cpi, &ctx->oxcf);
}
+ if (force_key)
+ ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF;
+
return res;
}
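encoder_set_config now tolerates a mid-stream resolution change in one-pass, low-lag configurations, and forces a key frame (via next_frame_flags) whenever the new size cannot be coded predictively from the old buffers, or on a profile change. A caller-side sketch, assuming a codec initialized for one-pass encoding with g_lag_in_frames <= 1:

    #include "vpx/vpx_encoder.h"

    /* Sketch: change the coded size mid-encode; a key frame is forced
     * automatically when prediction from the old size is impossible. */
    static vpx_codec_err_t change_resolution(vpx_codec_ctx_t *codec,
                                             vpx_codec_enc_cfg_t *cfg,
                                             unsigned int w, unsigned int h) {
      cfg->g_w = w;
      cfg->g_h = h;
      return vpx_codec_enc_config_set(codec, cfg);
    }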
@@ -649,6 +682,22 @@ static vpx_codec_err_t ctrl_set_rc_max_intra_bitrate_pct(
return update_extra_cfg(ctx, &extra_cfg);
}
+static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
+ vpx_codec_alg_priv_t *ctx, va_list args) {
+ struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.rc_max_inter_bitrate_pct =
+ CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(
+ vpx_codec_alg_priv_t *ctx, va_list args) {
+ struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.gf_cbr_boost_pct =
+ CAST(VP9E_SET_GF_CBR_BOOST_PCT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
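These two controls expose the new rate-control fields added to vp9_extracfg above. A usage sketch with illustrative values:

    #include "vpx/vp8cx.h"

    /* Sketch: cap inter frames at 150% of the average frame budget and give
     * CBR golden frames a 25% bitrate boost. Values are illustrative. */
    static void tune_cbr_rate_control(vpx_codec_ctx_t *codec) {
      vpx_codec_control(codec, VP9E_SET_MAX_INTER_BITRATE_PCT, 150);
      vpx_codec_control(codec, VP9E_SET_GF_CBR_BOOST_PCT, 25);
    }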
static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx,
va_list args) {
struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -691,6 +740,16 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
ctx->priv->enc.total_encoders = 1;
+ priv->buffer_pool =
+ (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+ if (priv->buffer_pool == NULL)
+ return VPX_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
+ return VPX_CODEC_MEM_ERROR;
+ }
+#endif
if (ctx->config.enc) {
// Update the reference to the config structure to an internal copy.
@@ -699,7 +758,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
}
priv->extra_cfg = default_extra_cfg;
- vp9_initialize_enc();
+ once(vp9_initialize_enc);
res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
@@ -709,7 +768,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
priv->oxcf.use_highbitdepth =
(ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
#endif
- priv->cpi = vp9_create_compressor(&priv->oxcf);
+ priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool);
if (priv->cpi == NULL)
res = VPX_CODEC_MEM_ERROR;
else
@@ -723,6 +782,10 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
free(ctx->cx_data);
vp9_remove_compressor(ctx->cpi);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+ vpx_free(ctx->buffer_pool);
vpx_free(ctx);
return VPX_CODEC_OK;
}
@@ -861,22 +924,26 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
vpx_codec_err_t res = VPX_CODEC_OK;
VP9_COMP *const cpi = ctx->cpi;
const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
+ size_t data_sz;
if (img != NULL) {
res = validate_img(ctx, img);
// TODO(jzern) the checks related to cpi's validity should be treated as a
// failure condition, encoder setup is done fully in init() currently.
- if (res == VPX_CODEC_OK && cpi != NULL && ctx->cx_data == NULL) {
+ if (res == VPX_CODEC_OK && cpi != NULL) {
// There's no codec control for multiple alt-refs so check the encoder
// instance for its status to determine the compressed data size.
- ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
- get_image_bps(img) / 8 *
- (cpi->multi_arf_allowed ? 8 : 2);
- if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096;
-
- ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
- if (ctx->cx_data == NULL) {
- return VPX_CODEC_MEM_ERROR;
+ data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
+ (cpi->multi_arf_allowed ? 8 : 2);
+ if (data_sz < 4096)
+ data_sz = 4096;
+ if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
+ ctx->cx_data_sz = data_sz;
+ free(ctx->cx_data);
+ ctx->cx_data = (unsigned char*)malloc(ctx->cx_data_sz);
+ if (ctx->cx_data == NULL) {
+ return VPX_CODEC_MEM_ERROR;
+ }
}
}
}
@@ -921,10 +988,11 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// Store the original flags in to the frame buffer. Will extract the
// key frame flag when we actually encode this frame.
- if (vp9_receive_raw_frame(cpi, flags,
+ if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags,
&sd, dst_time_stamp, dst_end_time_stamp)) {
res = update_error_state(ctx, &cpi->common.error);
}
+ ctx->next_frame_flags = 0;
}
cx_data = ctx->cx_data;
@@ -972,6 +1040,24 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
ctx->pending_frame_magnitude |= size;
cx_data += size;
cx_data_sz -= size;
+
+ if (ctx->output_cx_pkt_cb.output_cx_pkt) {
+ pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+ pkt.data.frame.pts = ticks_to_timebase_units(timebase,
+ dst_time_stamp);
+ pkt.data.frame.duration =
+ (unsigned long)ticks_to_timebase_units(timebase,
+ dst_end_time_stamp - dst_time_stamp);
+ pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+ pkt.data.frame.buf = ctx->pending_cx_data;
+ pkt.data.frame.sz = size;
+ ctx->pending_cx_data = NULL;
+ ctx->pending_cx_data_sz = 0;
+ ctx->pending_frame_count = 0;
+ ctx->pending_frame_magnitude = 0;
+ ctx->output_cx_pkt_cb.output_cx_pkt(
+ &pkt, ctx->output_cx_pkt_cb.user_priv);
+ }
continue;
}
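This block services the new per-frame packet callback: when one is registered, each compressed frame is handed out immediately and no superframe index is written. A hedged sketch of registering such a callback, assuming the pair type from vpx/internal/vpx_codec_internal.h, the ABI-gated VP9E_REGISTER_CX_CALLBACK control from the map below, and a callback signature of (vpx_codec_cx_pkt_t *, void *):

    #include <stddef.h>
    #include "vpx/vp8cx.h"
    #include "vpx/internal/vpx_codec_internal.h"

    /* Receive each compressed frame directly; no superframe index is
     * emitted on this path. Signature is assumed, not confirmed here. */
    static void on_packet(vpx_codec_cx_pkt_t *pkt, void *user_priv) {
      (void)user_priv;
      if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
        /* consume pkt->data.frame.buf, pkt->data.frame.sz here */
      }
    }

    static vpx_codec_err_t register_packet_callback(vpx_codec_ctx_t *codec) {
      vpx_codec_priv_output_cx_pkt_cb_pair_t cb = { on_packet, NULL };
      return vpx_codec_control(codec, VP9E_REGISTER_CX_CALLBACK, &cb);
    }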
@@ -987,7 +1073,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
ctx->pending_frame_magnitude |= size;
ctx->pending_cx_data_sz += size;
- size += write_superframe_index(ctx);
+ // Write the superframe index only when no output callback is registered.
+ if (!ctx->output_cx_pkt_cb.output_cx_pkt)
+ size += write_superframe_index(ctx);
pkt.data.frame.buf = ctx->pending_cx_data;
pkt.data.frame.sz = ctx->pending_cx_data_sz;
ctx->pending_cx_data = NULL;
@@ -999,11 +1087,16 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
pkt.data.frame.sz = size;
}
pkt.data.frame.partition_id = -1;
- vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+ if (ctx->output_cx_pkt_cb.output_cx_pkt)
+ ctx->output_cx_pkt_cb.output_cx_pkt(&pkt, ctx->output_cx_pkt_cb.user_priv);
+ else
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
cx_data += size;
cx_data_sz -= size;
#if CONFIG_SPATIAL_SVC
- if (is_two_pass_svc(cpi)) {
+ if (is_two_pass_svc(cpi) && !ctx->output_cx_pkt_cb.output_cx_pkt) {
vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
int i;
vp9_zero(pkt_sizes);
@@ -1016,7 +1109,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
pkt_psnr.data.layer_psnr[i] = lc->psnr_pkt;
lc->layer_size = 0;
}
+
vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes);
+
vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
}
#endif
@@ -1165,6 +1260,21 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx,
}
}
+static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);
+
+ if (map) {
+ if (!vp9_get_active_map(ctx->cpi, map->active_map,
+ (int)map->rows, (int)map->cols))
+ return VPX_CODEC_OK;
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ } else {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+}
+
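The new getter mirrors the active-map setter: the caller supplies a vpx_active_map_t whose dimensions must match the encoder's map. A sketch, assuming the 16x16 macroblock granularity the setter uses:

    #include <stdlib.h>
    #include "vpx/vp8cx.h"
    #include "vpx/vpx_encoder.h"

    /* Sketch: read back the encoder's active map. The 16x16 granularity of
     * rows/cols is an assumption here. */
    static int read_active_map(vpx_codec_ctx_t *codec,
                               unsigned int w, unsigned int h) {
      vpx_active_map_t map;
      int ok;
      map.rows = (h + 15) / 16;
      map.cols = (w + 15) / 16;
      map.active_map = (unsigned char *)malloc(map.rows * map.cols);
      if (map.active_map == NULL) return -1;
      ok = vpx_codec_control(codec, VP9E_GET_ACTIVEMAP, &map) == VPX_CODEC_OK;
      free(map.active_map);
      return ok ? 0 : -1;
    }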
static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
@@ -1203,7 +1313,9 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx,
VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
SVC *const svc = &cpi->svc;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
svc->spatial_layer_id = data->spatial_layer_id;
+#endif
svc->temporal_layer_id = data->temporal_layer_id;
// Checks on valid layer_id input.
if (svc->temporal_layer_id < 0 ||
@@ -1217,6 +1329,20 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_get_svc_layer_id(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ vpx_svc_layer_id_t *data = va_arg(args, vpx_svc_layer_id_t *);
+ VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
+ SVC *const svc = &cpi->svc;
+
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+ data->spatial_layer_id = svc->spatial_layer_id;
+#endif
+ data->temporal_layer_id = svc->temporal_layer_id;
+
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
va_list args) {
VP9_COMP *const cpi = ctx->cpi;
@@ -1235,6 +1361,16 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp =
+ (vpx_codec_priv_output_cx_pkt_cb_pair_t *)va_arg(args, void *);
+ ctx->output_cx_pkt_cb.output_cx_pkt = cbp->output_cx_pkt;
+ ctx->output_cx_pkt_cb.user_priv = cbp->user_priv;
+
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx,
va_list args) {
struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -1242,6 +1378,13 @@ static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.color_space = CAST(VP9E_SET_COLOR_SPACE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
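The color space set here is validated against the VPX_CS_UNKNOWN..VPX_CS_SRGB range above and written into the bitstream header. A usage sketch, assuming the vpx_color_space_t enumerators in vpx/vpx_image.h (VPX_CS_BT_709 here):

    #include "vpx/vp8cx.h"
    #include "vpx/vpx_image.h"

    /* Sketch: tag the stream as BT.709, a value inside the validated
     * VPX_CS_UNKNOWN..VPX_CS_SRGB range. */
    static vpx_codec_err_t tag_bt709(vpx_codec_ctx_t *codec) {
      return vpx_codec_control(codec, VP9E_SET_COLOR_SPACE, VPX_CS_BT_709);
    }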
static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{VP8_COPY_REFERENCE, ctrl_copy_reference},
{VP8E_UPD_ENTROPY, ctrl_update_entropy},
@@ -1266,20 +1409,30 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{VP8E_SET_TUNING, ctrl_set_tuning},
{VP8E_SET_CQ_LEVEL, ctrl_set_cq_level},
{VP8E_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct},
+ {VP9E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct},
+ {VP9E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct},
{VP9E_SET_LOSSLESS, ctrl_set_lossless},
{VP9E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode},
{VP9E_SET_AQ_MODE, ctrl_set_aq_mode},
{VP9E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost},
{VP9E_SET_SVC, ctrl_set_svc},
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
{VP9E_SET_SVC_PARAMETERS, ctrl_set_svc_parameters},
+ {VP9E_REGISTER_CX_CALLBACK, ctrl_register_cx_callback},
+#endif
{VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id},
{VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content},
+ {VP9E_SET_COLOR_SPACE, ctrl_set_color_space},
{VP9E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity},
// Getters
{VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer},
{VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64},
{VP9_GET_REFERENCE, ctrl_get_reference},
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+ {VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id},
+#endif
+ {VP9E_GET_ACTIVEMAP, ctrl_get_active_map},
{ -1, NULL},
};
@@ -1289,7 +1442,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
0,
{ // NOLINT
0, // g_usage
- 0, // g_threads
+ 8, // g_threads
0, // g_profile
320, // g_width
@@ -1307,21 +1460,19 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
0, // rc_dropframe_thresh
0, // rc_resize_allowed
- 1, // rc_scaled_width
- 1, // rc_scaled_height
+ 0, // rc_scaled_width
+ 0, // rc_scaled_height
60, // rc_resize_down_thresold
30, // rc_resize_up_thresold
VPX_VBR, // rc_end_usage
-#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
{NULL, 0}, // rc_twopass_stats_in
{NULL, 0}, // rc_firstpass_mb_stats_in
-#endif
256, // rc_target_bandwidth
0, // rc_min_quantizer
63, // rc_max_quantizer
- 100, // rc_undershoot_pct
- 100, // rc_overshoot_pct
+ 25, // rc_undershoot_pct
+ 25, // rc_overshoot_pct
6000, // rc_max_buffer_size
4000, // rc_buffer_initial_size
@@ -1344,9 +1495,6 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
{0}, // ts_rate_decimator
0, // ts_periodicity
{0}, // ts_layer_id
-#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
- "vp8.fpf" // first pass filename
-#endif
}
},
};
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c
index 85e32d351e9..ff76204d822 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c
@@ -11,13 +11,16 @@
#include <stdlib.h>
#include <string.h>
+#include "./vpx_config.h"
#include "./vpx_version.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_frame_buffers.h"
+#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_decodeframe.h"
@@ -29,20 +32,45 @@
typedef vpx_codec_stream_info_t vp9_stream_info_t;
+// This limit follows from the fixed number of frame buffers.
+// TODO(hkuang): Remove this limit after implementing on-demand frame buffers.
+#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames.
+
+typedef struct cache_frame {
+ int fb_idx;
+ vpx_image_t img;
+} cache_frame;
+
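The cache is a fixed-size ring: decoder_decode advances frame_cache_write modulo FRAME_CACHE_SIZE, decoder_get_frame advances frame_cache_read, and num_cache_frames tracks occupancy. A minimal sketch of that discipline:

    /* Minimal sketch of the ring-buffer discipline the frame cache follows. */
    #define CACHE_SIZE 6

    typedef struct {
      int write, read, count;
      int slots[CACHE_SIZE];
    } ring;

    static int ring_push(ring *r, int v) {
      if (r->count == CACHE_SIZE) return 0;  /* full: caller must drain first */
      r->slots[r->write] = v;
      r->write = (r->write + 1) % CACHE_SIZE;
      ++r->count;
      return 1;
    }

    static int ring_pop(ring *r, int *v) {
      if (r->count == 0) return 0;           /* empty */
      *v = r->slots[r->read];
      r->read = (r->read + 1) % CACHE_SIZE;
      --r->count;
      return 1;
    }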
struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
vpx_codec_dec_cfg_t cfg;
vp9_stream_info_t si;
- struct VP9Decoder *pbi;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
vpx_decrypt_cb decrypt_cb;
- void *decrypt_state;
+ void *decrypt_state;
vpx_image_t img;
int img_avail;
int flushed;
int invert_tile_order;
+ int last_show_frame; // Index of last output frame.
+ int byte_alignment;
+
+ // Frame parallel related.
int frame_parallel_decode; // frame-based threading.
+ VP9Worker *frame_workers;
+ int num_frame_workers;
+ int next_submit_worker_id;
+ int last_submit_worker_id;
+ int next_output_worker_id;
+ int available_threads;
+ cache_frame frame_cache[FRAME_CACHE_SIZE];
+ int frame_cache_write;
+ int frame_cache_read;
+ int num_cache_frames;
+ int need_resync; // wait for key/intra-only frame
+ // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+ BufferPool *buffer_pool;
// External frame buffer info to save for VP9 common.
void *ext_priv; // Private data associated with the external frame buffers.
@@ -64,13 +92,12 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
-
priv->si.sz = sizeof(priv->si);
priv->flushed = 0;
+ // Only do frame parallel decode when threads > 1.
priv->frame_parallel_decode =
- (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
- priv->frame_parallel_decode = 0; // Disable for now
-
+ (ctx->config.dec && (ctx->config.dec->threads > 1) &&
+ (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) ? 1 : 0;
if (ctx->config.dec) {
priv->cfg = *ctx->config.dec;
ctx->config.dec = &priv->cfg;
@@ -81,24 +108,48 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
}
static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
- if (ctx->pbi) {
- vp9_decoder_remove(ctx->pbi);
- ctx->pbi = NULL;
+ if (ctx->frame_workers != NULL) {
+ int i;
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ VP9Worker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ vp9_get_worker_interface()->end(worker);
+ vp9_remove_common(&frame_worker_data->pbi->common);
+#if CONFIG_VP9_POSTPROC
+ vp9_free_postproc_buffers(&frame_worker_data->pbi->common);
+#endif
+ vp9_decoder_remove(frame_worker_data->pbi);
+ vpx_free(frame_worker_data->scratch_buffer);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&frame_worker_data->stats_mutex);
+ pthread_cond_destroy(&frame_worker_data->stats_cond);
+#endif
+ vpx_free(frame_worker_data);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
}
- vpx_free(ctx);
+ if (ctx->buffer_pool) {
+ vp9_free_ref_frame_buffers(ctx->buffer_pool);
+ vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+ }
+ vpx_free(ctx->frame_workers);
+ vpx_free(ctx->buffer_pool);
+ vpx_free(ctx);
return VPX_CODEC_OK;
}
static int parse_bitdepth_colorspace_sampling(
BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) {
- const int sRGB = 7;
- int colorspace;
+ vpx_color_space_t color_space;
if (profile >= PROFILE_2)
rb->bit_offset += 1; // Bit-depth 10 or 12.
- colorspace = vp9_rb_read_literal(rb, 3);
- if (colorspace != sRGB) {
+ color_space = (vpx_color_space_t)vp9_rb_read_literal(rb, 3);
+ if (color_space != VPX_CS_SRGB) {
rb->bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range.
if (profile == PROFILE_1 || profile == PROFILE_3) {
rb->bit_offset += 2; // subsampling x/y.
@@ -146,7 +197,11 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
if (frame_marker != VP9_FRAME_MARKER)
return VPX_CODEC_UNSUP_BITSTREAM;
- if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM;
+ if (profile >= MAX_PROFILES)
+ return VPX_CODEC_UNSUP_BITSTREAM;
+
+ if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
+ return VPX_CODEC_UNSUP_BITSTREAM;
if (vp9_rb_read_bit(&rb)) { // show an existing frame
vp9_rb_read_literal(&rb, 3); // Frame buffer to show.
@@ -206,32 +261,45 @@ static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static void set_error_detail(vpx_codec_alg_priv_t *ctx,
+ const char *const error) {
+ ctx->base.err_detail = error;
+}
+
static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
const struct vpx_internal_error_info *error) {
if (error->error_code)
- ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+ set_error_detail(ctx, error->has_detail ? error->detail : NULL);
return error->error_code;
}
static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
- VP9_COMMON *const cm = &ctx->pbi->common;
+ int i;
- cm->new_fb_idx = -1;
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ VP9Worker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
- if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
- cm->get_fb_cb = ctx->get_ext_fb_cb;
- cm->release_fb_cb = ctx->release_ext_fb_cb;
- cm->cb_priv = ctx->ext_priv;
- } else {
- cm->get_fb_cb = vp9_get_frame_buffer;
- cm->release_fb_cb = vp9_release_frame_buffer;
+ cm->new_fb_idx = INVALID_IDX;
+ cm->byte_alignment = ctx->byte_alignment;
- if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
- "Failed to initialize internal frame buffers");
+ if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+ pool->get_fb_cb = ctx->get_ext_fb_cb;
+ pool->release_fb_cb = ctx->release_ext_fb_cb;
+ pool->cb_priv = ctx->ext_priv;
+ } else {
+ pool->get_fb_cb = vp9_get_frame_buffer;
+ pool->release_fb_cb = vp9_release_frame_buffer;
- cm->cb_priv = &cm->int_frame_buffers;
+ if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to initialize internal frame buffers");
+
+ pool->cb_priv = &pool->int_frame_buffers;
+ }
}
}
@@ -250,14 +318,127 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx,
flags->noise_level = ctx->postproc_cfg.noise_level;
}
-static void init_decoder(vpx_codec_alg_priv_t *ctx) {
- ctx->pbi = vp9_decoder_create();
- if (ctx->pbi == NULL)
- return;
+static int frame_worker_hook(void *arg1, void *arg2) {
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
+ const uint8_t *data = frame_worker_data->data;
+ (void)arg2;
+
+ frame_worker_data->result =
+ vp9_receive_compressed_data(frame_worker_data->pbi,
+ frame_worker_data->data_size,
+ &data);
+ frame_worker_data->data_end = data;
+
+ if (frame_worker_data->pbi->frame_parallel_decode) {
+ // In frame parallel decoding, a worker thread must successfully decode all
+ // the compressed data.
+ if (frame_worker_data->result != 0 ||
+ frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
+ VP9Worker *const worker = frame_worker_data->pbi->frame_worker_owner;
+ BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
+ // Signal all the other threads that are waiting for this frame.
+ vp9_frameworker_lock_stats(worker);
+ frame_worker_data->frame_context_ready = 1;
+ lock_buffer_pool(pool);
+ frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+ unlock_buffer_pool(pool);
+ frame_worker_data->pbi->need_resync = 1;
+ vp9_frameworker_signal_stats(worker);
+ vp9_frameworker_unlock_stats(worker);
+ return 0;
+ }
+ } else if (frame_worker_data->result != 0) {
+ // Check decode result in serial decode.
+ frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+ frame_worker_data->pbi->need_resync = 1;
+ }
+ return !frame_worker_data->result;
+}
+
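frame_worker_hook follows the VP9Worker contract used throughout this file: the hook runs on the worker thread with data1 as its first argument and returns nonzero on success, and a later sync() returns nonzero iff that hook succeeded. A minimal sketch of submitting a job under that contract:

    #include "vp9/common/vp9_thread.h"

    /* Minimal sketch of the worker contract relied on above. */
    static void submit_job(VP9Worker *worker, void *job) {
      const VP9WorkerInterface *const iface = vp9_get_worker_interface();
      worker->data1 = job;      /* delivered to worker->hook as arg1 */
      worker->had_error = 0;
      iface->launch(worker);    /* execute() would run the hook synchronously */
    }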
+static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
+ int i;
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+
+ ctx->last_show_frame = -1;
+ ctx->next_submit_worker_id = 0;
+ ctx->last_submit_worker_id = 0;
+ ctx->next_output_worker_id = 0;
+ ctx->frame_cache_read = 0;
+ ctx->frame_cache_write = 0;
+ ctx->num_cache_frames = 0;
+ ctx->need_resync = 1;
+ ctx->num_frame_workers =
+ (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1;
+ if (ctx->num_frame_workers > MAX_DECODE_THREADS)
+ ctx->num_frame_workers = MAX_DECODE_THREADS;
+ ctx->available_threads = ctx->num_frame_workers;
+ ctx->flushed = 0;
- ctx->pbi->max_threads = ctx->cfg.threads;
- ctx->pbi->inv_tile_order = ctx->invert_tile_order;
- ctx->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+ ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+ if (ctx->buffer_pool == NULL)
+ return VPX_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
+ set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+ return VPX_CODEC_MEM_ERROR;
+ }
+#endif
+
+ ctx->frame_workers = (VP9Worker *)
+ vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
+ if (ctx->frame_workers == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_workers");
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ VP9Worker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *frame_worker_data = NULL;
+ winterface->init(worker);
+ worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
+ if (worker->data1 == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data");
+ return VPX_CODEC_MEM_ERROR;
+ }
+ frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
+ if (frame_worker_data->pbi == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data");
+ return VPX_CODEC_MEM_ERROR;
+ }
+ frame_worker_data->pbi->frame_worker_owner = worker;
+ frame_worker_data->worker_id = i;
+ frame_worker_data->scratch_buffer = NULL;
+ frame_worker_data->scratch_buffer_size = 0;
+ frame_worker_data->frame_context_ready = 0;
+ frame_worker_data->received_frame = 0;
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
+ return VPX_CODEC_MEM_ERROR;
+ }
+#endif
+ // When decoding in serial mode, the FrameWorker thread may create tile
+ // worker threads or a loopfilter thread.
+ frame_worker_data->pbi->max_threads =
+ (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
+
+ frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+ frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+ frame_worker_data->pbi->common.frame_parallel_decode =
+ ctx->frame_parallel_decode;
+ worker->hook = (VP9WorkerHook)frame_worker_hook;
+ if (!winterface->reset(worker)) {
+ set_error_detail(ctx, "Frame Worker thread creation failed");
+ return VPX_CODEC_MEM_ERROR;
+ }
+ }
// If postprocessing was enabled by the application and a
// configuration has not been provided, default it.
@@ -266,20 +447,24 @@ static void init_decoder(vpx_codec_alg_priv_t *ctx) {
set_default_ppflags(&ctx->postproc_cfg);
init_buffer_callbacks(ctx);
+
+ return VPX_CODEC_OK;
+}
+
+static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
+ const VP9Decoder *const pbi) {
+ // Clear the resync flag if the worker got a key frame or intra-only frame.
+ if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
+ (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME))
+ ctx->need_resync = 0;
}
static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
const uint8_t **data, unsigned int data_sz,
void *user_priv, int64_t deadline) {
- YV12_BUFFER_CONFIG sd;
- vp9_ppflags_t flags = {0, 0, 0};
- VP9_COMMON *cm = NULL;
-
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
(void)deadline;
- vp9_zero(sd);
- ctx->img_avail = 0;
-
// Determine the stream parameters. Note that we rely on peek_si to
// validate that we have a buffer that does not wrap around the top
// of the heap.
@@ -295,36 +480,104 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_ERROR;
}
- // Initialize the decoder instance on the first frame
- if (ctx->pbi == NULL) {
- init_decoder(ctx);
- if (ctx->pbi == NULL)
- return VPX_CODEC_ERROR;
- }
-
- // Set these even if already initialized. The caller may have changed the
- // decrypt config between frames.
- ctx->pbi->decrypt_cb = ctx->decrypt_cb;
- ctx->pbi->decrypt_state = ctx->decrypt_state;
+ if (!ctx->frame_parallel_decode) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->data = *data;
+ frame_worker_data->data_size = data_sz;
+ frame_worker_data->user_priv = user_priv;
+ frame_worker_data->received_frame = 1;
- cm = &ctx->pbi->common;
+ // Set these even if already initialized. The caller may have changed the
+ // decrypt config between frames.
+ frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
+ frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
- if (vp9_receive_compressed_data(ctx->pbi, data_sz, data))
- return update_error_state(ctx, &cm->error);
+ worker->had_error = 0;
+ winterface->execute(worker);
- if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
- set_ppflags(ctx, &flags);
+ // Update data pointer after decode.
+ *data = frame_worker_data->data_end;
- if (vp9_get_raw_frame(ctx->pbi, &sd, &flags))
- return update_error_state(ctx, &cm->error);
+ if (worker->had_error)
+ return update_error_state(ctx, &frame_worker_data->pbi->common.error);
- yuvconfig2image(&ctx->img, &sd, user_priv);
- ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
- ctx->img_avail = 1;
+ check_resync(ctx, frame_worker_data->pbi);
+ } else {
+ VP9Worker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ // Copy context from last worker thread to next worker thread.
+ if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+ vp9_frameworker_copy_context(
+ &ctx->frame_workers[ctx->next_submit_worker_id],
+ &ctx->frame_workers[ctx->last_submit_worker_id]);
+
+ frame_worker_data->pbi->ready_for_new_data = 0;
+ // Copy the compressed data into worker's internal buffer.
+ // TODO(hkuang): Would it be better for all workers to allocate a buffer
+ // the size of the first intra frame? That would avoid repeated
+ // deallocation and reallocation.
+ if (frame_worker_data->scratch_buffer_size < data_sz) {
+ frame_worker_data->scratch_buffer =
+ (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz);
+ if (frame_worker_data->scratch_buffer == NULL) {
+ set_error_detail(ctx, "Failed to reallocate scratch buffer");
+ return VPX_CODEC_MEM_ERROR;
+ }
+ frame_worker_data->scratch_buffer_size = data_sz;
+ }
+ frame_worker_data->data_size = data_sz;
+ memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
+
+ frame_worker_data->frame_decoded = 0;
+ frame_worker_data->frame_context_ready = 0;
+ frame_worker_data->received_frame = 1;
+ frame_worker_data->data = frame_worker_data->scratch_buffer;
+ frame_worker_data->user_priv = user_priv;
+
+ if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+ ctx->last_submit_worker_id =
+ (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
+
+ ctx->next_submit_worker_id =
+ (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
+ --ctx->available_threads;
+ worker->had_error = 0;
+ winterface->launch(worker);
+ }
return VPX_CODEC_OK;
}
+static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
+ YV12_BUFFER_CONFIG sd;
+ vp9_ppflags_t flags = {0, 0, 0};
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ ctx->next_output_worker_id =
+ (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+ // TODO(hkuang): Add worker error handling here.
+ winterface->sync(worker);
+ frame_worker_data->received_frame = 0;
+ ++ctx->available_threads;
+
+ check_resync(ctx, frame_worker_data->pbi);
+
+ if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+ VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
+ yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
+ frame_worker_data->user_priv);
+ ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
+ frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+ ctx->frame_cache_write =
+ (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
+ ++ctx->num_cache_frames;
+ }
+}
+
static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
const uint8_t *data, unsigned int data_sz,
void *user_priv, long deadline) {
@@ -342,6 +595,13 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
// Reset flushed when receiving a valid frame.
ctx->flushed = 0;
+ // Initialize the decoder workers on the first frame.
+ if (ctx->frame_workers == NULL) {
+ const vpx_codec_err_t res = init_decoder(ctx);
+ if (res != VPX_CODEC_OK)
+ return res;
+ }
+
res = vp9_parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
ctx->decrypt_cb, ctx->decrypt_state);
if (res != VPX_CODEC_OK)
@@ -358,30 +618,46 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
for (i = 0; i < frame_count; ++i) {
const uint8_t *data_start_copy = data_start;
const uint32_t frame_size = frame_sizes[i];
- vpx_codec_err_t res;
if (data_start < data
|| frame_size > (uint32_t) (data_end - data_start)) {
- ctx->base.err_detail = "Invalid frame size in index";
+ set_error_detail(ctx, "Invalid frame size in index");
return VPX_CODEC_CORRUPT_FRAME;
}
+ if (ctx->available_threads == 0) {
+ // No more threads are free for decoding. Wait until the next output
+ // worker finishes, then copy its decoded frame into the cache.
+ if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+ wait_worker_and_cache_frame(ctx);
+ } else {
+ // TODO(hkuang): Add unit test to test this path.
+ set_error_detail(ctx, "Frame output cache is full.");
+ return VPX_CODEC_ERROR;
+ }
+ }
+
res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
deadline);
if (res != VPX_CODEC_OK)
return res;
-
data_start += frame_size;
}
} else {
- res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
+ if (ctx->available_threads == 0) {
+ // No more threads are free for decoding. Wait until the next output
+ // worker finishes, then copy its decoded frame into the cache.
+ if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+ wait_worker_and_cache_frame(ctx);
+ } else {
+ // TODO(hkuang): Add unit test to test this path.
+ set_error_detail(ctx, "Frame output cache is full.");
+ return VPX_CODEC_ERROR;
+ }
+ }
+
+ res = decode_one(ctx, &data, data_sz, user_priv, deadline);
if (res != VPX_CODEC_OK)
return res;
-
- // Extra data detected after the frame.
- if (data_start < data_end - 1) {
- ctx->base.err_detail = "Fail to decode frame in parallel mode";
- return VPX_CODEC_INCAPABLE;
- }
}
} else {
// Decode in serial mode.
@@ -394,7 +670,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
vpx_codec_err_t res;
if (data_start < data
|| frame_size > (uint32_t) (data_end - data_start)) {
- ctx->base.err_detail = "Invalid frame size in index";
+ set_error_detail(ctx, "Invalid frame size in index");
return VPX_CODEC_CORRUPT_FRAME;
}
@@ -425,24 +701,89 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
}
}
- return VPX_CODEC_OK;
+ return res;
+}
+
+static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
+ RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
+ // Decrease reference count of last output frame in frame parallel mode.
+ if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
+ BufferPool *const pool = ctx->buffer_pool;
+ lock_buffer_pool(pool);
+ decrease_ref_count(ctx->last_show_frame, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ }
}
static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
vpx_codec_iter_t *iter) {
vpx_image_t *img = NULL;
- if (ctx->img_avail) {
- // iter acts as a flip flop, so an image is only returned on the first
- // call to get_frame.
- if (!(*iter)) {
- img = &ctx->img;
- *iter = img;
- }
+ // Only return a frame when all the CPUs are busy, or when the
+ // application has flushed the decoder in frame-parallel decode.
+ if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
+ !ctx->flushed) {
+ return NULL;
+ }
+
+ // Output the frames in the cache first.
+ if (ctx->num_cache_frames > 0) {
+ release_last_output_frame(ctx);
+ ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
+ if (ctx->need_resync)
+ return NULL;
+ img = &ctx->frame_cache[ctx->frame_cache_read].img;
+ ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
+ --ctx->num_cache_frames;
+ return img;
}
- ctx->img_avail = 0;
- return img;
+ // iter acts as a flip flop, so an image is only returned on the first
+ // call to get_frame.
+ if (*iter == NULL && ctx->frame_workers != NULL) {
+ do {
+ YV12_BUFFER_CONFIG sd;
+ vp9_ppflags_t flags = {0, 0, 0};
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ VP9Worker *const worker =
+ &ctx->frame_workers[ctx->next_output_worker_id];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ ctx->next_output_worker_id =
+ (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+ if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
+ set_ppflags(ctx, &flags);
+ // Wait for the frame from the worker thread.
+ if (winterface->sync(worker)) {
+ // Check if the worker has received any frames.
+ if (frame_worker_data->received_frame == 1) {
+ ++ctx->available_threads;
+ frame_worker_data->received_frame = 0;
+ check_resync(ctx, frame_worker_data->pbi);
+ }
+ if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+ VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ release_last_output_frame(ctx);
+ ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
+ if (ctx->need_resync)
+ return NULL;
+ yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
+ ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+ img = &ctx->img;
+ return img;
+ }
+ } else {
+ // Decoding failed. Release the worker thread.
+ frame_worker_data->received_frame = 0;
+ ++ctx->available_threads;
+ ctx->need_resync = 1;
+ if (ctx->flushed != 1)
+ return NULL;
+ }
+ } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
+ }
+ return NULL;
}
static vpx_codec_err_t decoder_set_fb_fn(
@@ -451,7 +792,7 @@ static vpx_codec_err_t decoder_set_fb_fn(
vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
if (cb_get == NULL || cb_release == NULL) {
return VPX_CODEC_INVALID_PARAM;
- } else if (ctx->pbi == NULL) {
+ } else if (ctx->frame_workers == NULL) {
// If the decoder has already been initialized, do not accept changes to
// the frame buffer functions.
ctx->get_ext_fb_cb = cb_get;
@@ -467,12 +808,19 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *);
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
if (data) {
vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
YV12_BUFFER_CONFIG sd;
-
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
- return vp9_set_reference_dec(&ctx->pbi->common,
+ return vp9_set_reference_dec(&frame_worker_data->pbi->common,
(VP9_REFFRAME)frame->frame_type, &sd);
} else {
return VPX_CODEC_INVALID_PARAM;
@@ -483,13 +831,19 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
if (data) {
- vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data;
YV12_BUFFER_CONFIG sd;
-
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
-
- return vp9_copy_reference_dec(ctx->pbi,
+ return vp9_copy_reference_dec(frame_worker_data->pbi,
(VP9_REFFRAME)frame->frame_type, &sd);
} else {
return VPX_CODEC_INVALID_PARAM;
@@ -500,10 +854,18 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
if (data) {
- YV12_BUFFER_CONFIG* fb = get_ref_frame(&ctx->pbi->common, data->idx);
+ YV12_BUFFER_CONFIG* fb;
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
if (fb == NULL) return VPX_CODEC_ERROR;
-
yuvconfig2image(&data->img, fb, NULL);
return VPX_CODEC_OK;
} else {
@@ -541,65 +903,122 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const update_info = va_arg(args, int *);
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
if (update_info) {
- if (ctx->pbi)
- *update_info = ctx->pbi->refresh_frame_flags;
- else
+ if (ctx->frame_workers) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ *update_info = frame_worker_data->pbi->refresh_frame_flags;
+ return VPX_CODEC_OK;
+ } else {
return VPX_CODEC_ERROR;
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
+ }
}
-}
+ return VPX_CODEC_INVALID_PARAM;
+}
static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *corrupted = va_arg(args, int *);
- if (corrupted != NULL && ctx->pbi != NULL) {
- const YV12_BUFFER_CONFIG *const frame = ctx->pbi->common.frame_to_show;
- if (frame == NULL) return VPX_CODEC_ERROR;
- *corrupted = frame->corrupted;
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
+ if (corrupted) {
+ if (ctx->frame_workers) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ RefCntBuffer *const frame_bufs =
+ frame_worker_data->pbi->common.buffer_pool->frame_bufs;
+ if (frame_worker_data->pbi->common.frame_to_show == NULL)
+ return VPX_CODEC_ERROR;
+ *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
+ return VPX_CODEC_OK;
+ } else {
+ return VPX_CODEC_ERROR;
+ }
+ }
+
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const frame_size = va_arg(args, int *);
+
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
+ if (frame_size) {
+ if (ctx->frame_workers) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ frame_size[0] = cm->width;
+ frame_size[1] = cm->height;
+ return VPX_CODEC_OK;
+ } else {
+ return VPX_CODEC_ERROR;
+ }
}
+
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const display_size = va_arg(args, int *);
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
if (display_size) {
- if (ctx->pbi) {
- const VP9_COMMON *const cm = &ctx->pbi->common;
+ if (ctx->frame_workers) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
display_size[0] = cm->display_width;
display_size[1] = cm->display_height;
+ return VPX_CODEC_OK;
} else {
return VPX_CODEC_ERROR;
}
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
}
+
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
va_list args) {
unsigned int *const bit_depth = va_arg(args, unsigned int *);
+ VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
if (bit_depth) {
- if (ctx->pbi) {
- const VP9_COMMON *const cm = &ctx->pbi->common;
+ if (worker) {
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
*bit_depth = cm->bit_depth;
return VPX_CODEC_OK;
} else {
return VPX_CODEC_ERROR;
}
- } else {
- return VPX_CODEC_INVALID_PARAM;
}
+
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
@@ -616,6 +1035,29 @@ static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ const int legacy_byte_alignment = 0;
+ const int min_byte_alignment = 32;
+ const int max_byte_alignment = 1024;
+ const int byte_alignment = va_arg(args, int);
+
+ if (byte_alignment != legacy_byte_alignment &&
+ (byte_alignment < min_byte_alignment ||
+ byte_alignment > max_byte_alignment ||
+ (byte_alignment & (byte_alignment - 1)) != 0))
+ return VPX_CODEC_INVALID_PARAM;
+
+ ctx->byte_alignment = byte_alignment;
+ if (ctx->frame_workers) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi->common.byte_alignment = byte_alignment;
+ }
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{VP8_COPY_REFERENCE, ctrl_copy_reference},
@@ -628,6 +1070,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options},
{VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order},
{VPXD_SET_DECRYPTOR, ctrl_set_decryptor},
+ {VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment},
// Getters
{VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates},
@@ -635,6 +1078,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{VP9_GET_REFERENCE, ctrl_get_reference},
{VP9D_GET_DISPLAY_SIZE, ctrl_get_display_size},
{VP9D_GET_BIT_DEPTH, ctrl_get_bit_depth},
+ {VP9D_GET_FRAME_SIZE, ctrl_get_frame_size},
{ -1, NULL},
};
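For reference, under the frame-parallel path above vpx_codec_get_frame() may legitimately return NULL while worker threads are still busy; buffered frames only drain once the decoder is flushed. A minimal driving loop consistent with these semantics might look as follows — a sketch only, where read_frame(), write_image() and die() are hypothetical helpers, not libvpx API, and includes are elided:

    vpx_codec_ctx_t codec;   /* assumed initialized via vpx_codec_dec_init() */
    vpx_codec_iter_t iter;
    vpx_image_t *img;
    const uint8_t *buf;
    size_t buf_sz;

    while (read_frame(&buf, &buf_sz)) {
      if (vpx_codec_decode(&codec, buf, (unsigned int)buf_sz, NULL, 0))
        die("vpx_codec_decode failed");
      iter = NULL;
      /* May yield nothing yet; frames surface as workers finish. */
      while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
        write_image(img);
    }
    /* A NULL/0 call flushes the decoder so cached frames drain. */
    vpx_codec_decode(&codec, NULL, 0, NULL, 0);
    iter = NULL;
    while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
      write_image(img);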
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h
index 00fbfdd7dbb..e585aa14725 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h
@@ -34,6 +34,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
bps = 12;
}
}
+ img->cs = yv12->color_space;
img->bit_depth = 8;
img->w = yv12->y_stride;
img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
@@ -92,6 +93,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
yv12->y_stride = img->stride[VPX_PLANE_Y];
yv12->uv_stride = img->stride[VPX_PLANE_U];
+ yv12->color_space = img->cs;
#if CONFIG_VP9_HIGHBITDEPTH
if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk b/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
index e72cb0024f5..7359b2de05d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
@@ -24,13 +24,17 @@ VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
VP9_CX_SRCS-yes += encoder/vp9_cost.h
VP9_CX_SRCS-yes += encoder/vp9_cost.c
VP9_CX_SRCS-yes += encoder/vp9_dct.c
+VP9_CX_SRCS-yes += encoder/vp9_dct.h
VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c
VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
VP9_CX_SRCS-yes += encoder/vp9_encodemb.c
VP9_CX_SRCS-yes += encoder/vp9_encodemv.c
+VP9_CX_SRCS-yes += encoder/vp9_ethread.h
+VP9_CX_SRCS-yes += encoder/vp9_ethread.c
VP9_CX_SRCS-yes += encoder/vp9_extend.c
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_fastssim.c
VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
VP9_CX_SRCS-yes += encoder/vp9_block.h
VP9_CX_SRCS-yes += encoder/vp9_writer.h
@@ -59,12 +63,12 @@ VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
VP9_CX_SRCS-yes += encoder/vp9_encoder.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_psnrhvs.c
VP9_CX_SRCS-yes += encoder/vp9_quantize.c
VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
VP9_CX_SRCS-yes += encoder/vp9_rd.c
VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
VP9_CX_SRCS-yes += encoder/vp9_pickmode.c
-VP9_CX_SRCS-yes += encoder/vp9_sad.c
VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
VP9_CX_SRCS-yes += encoder/vp9_speed_features.c
@@ -76,6 +80,8 @@ VP9_CX_SRCS-yes += encoder/vp9_resize.c
VP9_CX_SRCS-yes += encoder/vp9_resize.h
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c
+
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
VP9_CX_SRCS-yes += encoder/vp9_variance.c
@@ -85,6 +91,8 @@ VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.h
VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.c
VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.h
+VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c
+VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
@@ -95,34 +103,39 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
+endif
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
+endif
endif
ifeq ($(ARCH_X86_64),yes)
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
endif
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
-VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad_intrin_avx2.c
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_impl_sse2.c
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
@@ -133,10 +146,12 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+endif
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9dx.mk b/chromium/third_party/libvpx/source/libvpx/vp9/vp9dx.mk
index 1fcb36f668c..c105adb7967 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9dx.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9dx.mk
@@ -21,14 +21,14 @@ VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.h
VP9_DX_SRCS-yes += decoder/vp9_reader.h
VP9_DX_SRCS-yes += decoder/vp9_reader.c
VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c
VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
+VP9_DX_SRCS-yes += decoder/vp9_dthread.c
+VP9_DX_SRCS-yes += decoder/vp9_dthread.h
VP9_DX_SRCS-yes += decoder/vp9_decoder.c
VP9_DX_SRCS-yes += decoder/vp9_decoder.h
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/exports_enc b/chromium/third_party/libvpx/source/libvpx/vpx/exports_enc
index 88859206545..e4707ba1082 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/exports_enc
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/exports_enc
@@ -1,5 +1,6 @@
text vpx_codec_enc_config_default
text vpx_codec_enc_config_set
+text vpx_codec_enc_init_multi_ver
text vpx_codec_enc_init_ver
text vpx_codec_encode
text vpx_codec_get_cx_data
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h b/chromium/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h
index cbfffd0af2a..7380fcc7e24 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h
@@ -425,10 +425,18 @@ struct vpx_internal_error_info {
jmp_buf jmp;
};
+#define CLANG_ANALYZER_NORETURN
+#if defined(__has_feature)
+#if __has_feature(attribute_analyzer_noreturn)
+#undef CLANG_ANALYZER_NORETURN
+#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+#endif
+#endif
+
void vpx_internal_error(struct vpx_internal_error_info *info,
vpx_codec_err_t error,
const char *fmt,
- ...);
+ ...) CLANG_ANALYZER_NORETURN;
#ifdef __cplusplus
} // extern "C"
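The CLANG_ANALYZER_NORETURN addition above follows the usual feature-probe idiom: define the macro to nothing, then redefine it only when the compiler advertises the attribute, so non-clang builds see an empty token. The same shape ports to any __has_feature probe; a sketch with a hypothetical macro and function name:

    /* Expands to the attribute under clang's static analyzer, else to nothing. */
    #define MY_ANALYZER_NORETURN
    #if defined(__has_feature)
    #if __has_feature(attribute_analyzer_noreturn)
    #undef MY_ANALYZER_NORETURN
    #define MY_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
    #endif
    #endif

    void fatal_error(const char *msg) MY_ANALYZER_NORETURN;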
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
index fa3409c6983..e711cf909ba 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
@@ -44,8 +44,6 @@ _CRTIMP char *__cdecl strtok_s(char *str, const char *delim, char **context);
#define SVC_REFERENCE_FRAMES 8
#define SUPERFRAME_SLOTS (8)
#define SUPERFRAME_BUFFER_SIZE (SUPERFRAME_SLOTS * sizeof(uint32_t) + 2)
-#define OPTION_BUFFER_SIZE 1024
-#define COMPONENTS 4 // psnr & sse statistics maintained for total, y, u, v
#define MAX_QUANTIZER 63
@@ -81,52 +79,26 @@ typedef struct FrameData {
struct FrameData *next;
} FrameData;
-typedef struct SvcInternal {
- char options[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_options
-
- // values extracted from option, quantizers
- vpx_svc_extra_cfg_t svc_params;
- int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
- int bitrates[VPX_SS_MAX_LAYERS];
-
- // accumulated statistics
- double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V
- uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS];
- uint32_t bytes_sum[VPX_SS_MAX_LAYERS];
-
- // codec encoding values
- int width; // width of highest layer
- int height; // height of highest layer
- int kf_dist; // distance between keyframes
-
- // state variables
- int psnr_pkt_received;
- int layer;
- int use_multiple_frame_contexts;
-
- char message_buffer[2048];
- vpx_codec_ctx_t *codec_ctx;
-} SvcInternal;
-
-static SvcInternal *get_svc_internal(SvcContext *svc_ctx) {
+static SvcInternal_t *get_svc_internal(SvcContext *svc_ctx) {
if (svc_ctx == NULL) return NULL;
if (svc_ctx->internal == NULL) {
- SvcInternal *const si = (SvcInternal *)malloc(sizeof(*si));
+ SvcInternal_t *const si = (SvcInternal_t *)malloc(sizeof(*si));
if (si != NULL) {
memset(si, 0, sizeof(*si));
}
svc_ctx->internal = si;
}
- return (SvcInternal *)svc_ctx->internal;
+ return (SvcInternal_t *)svc_ctx->internal;
}
-static const SvcInternal *get_const_svc_internal(const SvcContext *svc_ctx) {
+static const SvcInternal_t *get_const_svc_internal(
+ const SvcContext *svc_ctx) {
if (svc_ctx == NULL) return NULL;
- return (const SvcInternal *)svc_ctx->internal;
+ return (const SvcInternal_t *)svc_ctx->internal;
}
static void svc_log_reset(SvcContext *svc_ctx) {
- SvcInternal *const si = (SvcInternal *)svc_ctx->internal;
+ SvcInternal_t *const si = (SvcInternal_t *)svc_ctx->internal;
si->message_buffer[0] = '\0';
}
@@ -135,7 +107,7 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level,
char buf[512];
int retval = 0;
va_list ap;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
if (level > svc_ctx->log_level) {
return retval;
@@ -233,7 +205,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
char *option_name;
char *option_value;
char *input_ptr;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
vpx_codec_err_t res = VPX_CODEC_OK;
int i, alt_ref_enabled = 0;
@@ -315,8 +287,9 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
return res;
}
-vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
- SvcInternal *const si = get_svc_internal(svc_ctx);
+vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx,
+ const char *options) {
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || options == NULL || si == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
@@ -328,7 +301,7 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
void assign_layer_bitrates(const SvcContext *svc_ctx,
vpx_codec_enc_cfg_t *const enc_cfg) {
int i;
- const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+ const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
if (si->bitrates[0] != 0) {
enc_cfg->rc_target_bitrate = 0;
@@ -364,7 +337,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
vpx_codec_enc_cfg_t *enc_cfg) {
vpx_codec_err_t res;
int i;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
enc_cfg == NULL) {
return VPX_CODEC_INVALID_PARAM;
@@ -454,13 +427,15 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
* Encode a frame into multiple layers
* Create a superframe containing the individual layers
*/
-vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
- struct vpx_image *rawimg, vpx_codec_pts_t pts,
+vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx,
+ vpx_codec_ctx_t *codec_ctx,
+ struct vpx_image *rawimg,
+ vpx_codec_pts_t pts,
int64_t duration, int deadline) {
vpx_codec_err_t res;
vpx_codec_iter_t iter;
const vpx_codec_cx_pkt_t *cx_pkt;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
@@ -524,7 +499,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
}
const char *vpx_svc_get_message(const SvcContext *svc_ctx) {
- const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+ const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
if (svc_ctx == NULL || si == NULL) return NULL;
return si->message_buffer;
}
@@ -544,7 +519,7 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
double mse[COMPONENTS];
double y_scale;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || si == NULL) return NULL;
svc_log_reset(svc_ctx);
@@ -595,11 +570,11 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
}
void vpx_svc_release(SvcContext *svc_ctx) {
- SvcInternal *si;
+ SvcInternal_t *si;
if (svc_ctx == NULL) return;
// do not use get_svc_internal as it will unnecessarily allocate an
- // SvcInternal if it was not already allocated
- si = (SvcInternal *)svc_ctx->internal;
+ // SvcInternal_t if it was not already allocated
+ si = (SvcInternal_t *)svc_ctx->internal;
if (si != NULL) {
free(si);
svc_ctx->internal = NULL;
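The SvcInternal to SvcInternal_t rename leaves the public entry points unchanged. A minimal usage sketch, assuming codec, enc_cfg, raw, pts and duration are already set up, with die() as a hypothetical error helper and an illustrative option string:

    SvcContext svc_ctx;
    memset(&svc_ctx, 0, sizeof(svc_ctx));
    svc_ctx.spatial_layers = 2;
    svc_ctx.log_level = SVC_LOG_INFO;
    vpx_svc_set_options(&svc_ctx, "scale-factors=1/2,1/1");
    if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) != VPX_CODEC_OK)
      die("vpx_svc_init failed");
    vpx_svc_encode(&svc_ctx, &codec, raw, pts, duration, VPX_DL_GOOD_QUALITY);
    vpx_svc_release(&svc_ctx);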
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h b/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h
index 61b5f4ba0cc..cf791bdeb56 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h
@@ -41,6 +41,36 @@ typedef struct {
void *internal;
} SvcContext;
+#define OPTION_BUFFER_SIZE 1024
+#define COMPONENTS 4 // psnr & sse statistics maintained for total, y, u, v
+
+typedef struct SvcInternal {
+ char options[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_options
+
+ // values extracted from option, quantizers
+ vpx_svc_extra_cfg_t svc_params;
+ int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
+ int bitrates[VPX_SS_MAX_LAYERS];
+
+ // accumulated statistics
+ double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V
+ uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS];
+ uint32_t bytes_sum[VPX_SS_MAX_LAYERS];
+
+ // codec encoding values
+ int width; // width of highest layer
+ int height; // height of highest layer
+ int kf_dist; // distance between keyframes
+
+ // state variables
+ int psnr_pkt_received;
+ int layer;
+ int use_multiple_frame_contexts;
+
+ char message_buffer[2048];
+ vpx_codec_ctx_t *codec_ctx;
+} SvcInternal_t;
+
/**
* Set SVC options
* options are supplied as a single string separated by spaces
@@ -54,14 +84,17 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options);
/**
* initialize SVC encoding
*/
-vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
+vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx,
+ vpx_codec_ctx_t *codec_ctx,
vpx_codec_iface_t *iface,
vpx_codec_enc_cfg_t *cfg);
/**
* encode a frame of video with multiple layers
*/
-vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
- struct vpx_image *rawimg, vpx_codec_pts_t pts,
+vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx,
+ vpx_codec_ctx_t *codec_ctx,
+ struct vpx_image *rawimg,
+ vpx_codec_pts_t pts,
int64_t duration, int deadline);
/**
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
index 77d9d6a1c52..0e8adc134c5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
@@ -10,15 +10,16 @@
#ifndef VPX_VP8CX_H_
#define VPX_VP8CX_H_
-/*!\defgroup vp8_encoder WebM VP8 Encoder
+/*!\defgroup vp8_encoder WebM VP8/VP9 Encoder
* \ingroup vp8
*
* @{
*/
#include "./vp8.h"
+#include "./vpx_encoder.h"
/*!\file
- * \brief Provides definitions for using the VP8 encoder algorithm within the
+ * \brief Provides definitions for using the VP8 or VP9 encoder algorithm within the
* vpx Codec Interface.
*/
@@ -28,17 +29,20 @@ extern "C" {
/*!\name Algorithm interface for VP8
*
- * This interface provides the capability to encode raw VP8 streams, as would
- * be found in AVI files.
+ * This interface provides the capability to encode raw VP8 streams.
* @{
*/
extern vpx_codec_iface_t vpx_codec_vp8_cx_algo;
extern vpx_codec_iface_t *vpx_codec_vp8_cx(void);
+/*!@} - end algorithm interface member group*/
-/* TODO(jkoleszar): These move to VP9 in a later patch set. */
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to encode raw VP9 streams.
+ * @{
+ */
extern vpx_codec_iface_t vpx_codec_vp9_cx_algo;
extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
-
/*!@} - end algorithm interface member group*/
@@ -121,66 +125,145 @@ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
#define VP8_EFLAG_NO_UPD_ENTROPY (1<<20)
-/*!\brief VP8 encoder control functions
+/*!\brief VPx encoder control functions
*
- * This set of macros define the control functions available for the VP8
+ * This set of macros defines the control functions available for the VPx
* encoder interface.
*
* \sa #vpx_codec_control
*/
enum vp8e_enc_control_id {
- VP8E_UPD_ENTROPY = 5, /**< control function to set mode of entropy update in encoder */
- VP8E_UPD_REFERENCE, /**< control function to set reference update mode in encoder */
- VP8E_USE_REFERENCE, /**< control function to set which reference frame encoder can use */
- VP8E_SET_ROI_MAP, /**< control function to pass an ROI map to encoder */
- VP8E_SET_ACTIVEMAP, /**< control function to pass an Active map to encoder */
- VP8E_SET_SCALEMODE = 11, /**< control function to set encoder scaling mode */
- /*!\brief control function to set vp8 encoder cpuused
+ /*!\brief Codec control function to set mode of entropy update in encoder.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_UPD_ENTROPY = 5,
+
+ /*!\brief Codec control function to set reference update mode in encoder.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_UPD_REFERENCE,
+
+ /*!\brief Codec control function to set which reference frame encoder can use.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_USE_REFERENCE,
+
+ /*!\brief Codec control function to pass an ROI map to encoder.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_ROI_MAP,
+
+ /*!\brief Codec control function to pass an Active map to encoder.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_ACTIVEMAP,
+
+ /*!\brief Codec control function to set encoder scaling mode.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_SCALEMODE = 11,
+
+ /*!\brief Codec control function to set encoder internal speed settings.
*
* Changes in this value influences, among others, the encoder's selection
* of motion estimation methods. Values greater than 0 will increase encoder
* speed at the expense of quality.
- * The full set of adjustments can be found in
- * onyx_if.c:vp8_set_speed_features().
- * \todo List highlights of the changes at various levels.
*
- * \note Valid range: -16..16
+ * \note Valid range for VP8: -16..16
+ * \note Valid range for VP9: -8..8
+ *
+ * Supported in codecs: VP8, VP9
*/
VP8E_SET_CPUUSED = 13,
- VP8E_SET_ENABLEAUTOALTREF, /**< control function to enable vp8 to automatic set and use altref frame */
+
+ /*!\brief Codec control function to enable automatic use of altref frames.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_ENABLEAUTOALTREF,
+
/*!\brief control function to set noise sensitivity
*
* 0: off, 1: OnYOnly, 2: OnYUV,
* 3: OnYUVAggressive, 4: Adaptive
+ *
+ * Supported in codecs: VP8
*/
VP8E_SET_NOISE_SENSITIVITY,
- VP8E_SET_SHARPNESS, /**< control function to set sharpness */
- VP8E_SET_STATIC_THRESHOLD, /**< control function to set the threshold for macroblocks treated static */
- VP8E_SET_TOKEN_PARTITIONS, /**< control function to set the number of token partitions */
- VP8E_GET_LAST_QUANTIZER, /**< return the quantizer chosen by the
- encoder for the last frame using the internal
- scale */
- VP8E_GET_LAST_QUANTIZER_64, /**< return the quantizer chosen by the
- encoder for the last frame, using the 0..63
- scale as used by the rc_*_quantizer config
- parameters */
- VP8E_SET_ARNR_MAXFRAMES, /**< control function to set the max number of frames blurred creating arf*/
- VP8E_SET_ARNR_STRENGTH, //!< control function to set the filter
- //!< strength for the arf
-
- /*!\deprecated control function to set the filter type to use for the arf */
+
+ /*!\brief Codec control function to set sharpness.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_SHARPNESS,
+
+ /*!\brief Codec control function to set the threshold for MBs treated as static.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_STATIC_THRESHOLD,
+
+ /*!\brief Codec control function to set the number of token partitions.
+ *
+ * Supported in codecs: VP8
+ */
+ VP8E_SET_TOKEN_PARTITIONS,
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder.
+ *
+ * Return value uses internal quantizer scale defined by the codec.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_GET_LAST_QUANTIZER,
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder.
+ *
+ * Return value uses the 0..63 scale as used by the rc_*_quantizer config
+ * parameters.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_GET_LAST_QUANTIZER_64,
+
+ /*!\brief Codec control function to set the maximum number of frames used to create the arf.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_ARNR_MAXFRAMES,
+
+ /*!\brief Codec control function to set the filter strength for the arf.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_ARNR_STRENGTH,
+
+ /*!\deprecated control function to set the filter type to use for the arf. */
VP8E_SET_ARNR_TYPE,
- VP8E_SET_TUNING, /**< control function to set visual tuning */
- /*!\brief control function to set constrained quality level
+ /*!\brief Codec control function to set visual tuning.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_TUNING,
+
+ /*!\brief Codec control function to set constrained quality level.
*
* \attention For this value to be used vpx_codec_enc_cfg_t::g_usage must be
* set to #VPX_CQ.
* \note Valid range: 0..63
+ *
+ * Supported in codecs: VP8, VP9
*/
VP8E_SET_CQ_LEVEL,
- /*!\brief Max data rate for Intra frames
+ /*!\brief Codec control function to set Max data rate for Intra frames.
*
* This value controls additional clamping on the maximum size of a
* keyframe. It is expressed as a percentage of the average
@@ -191,32 +274,246 @@ enum vp8e_enc_control_id {
* For example, to allocate no more than 4.5 frames worth of bitrate
* to a keyframe, set this to 450.
*
+ * Supported in codecs: VP8, VP9
*/
VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ /*!\brief Codec control function to set reference and update frame flags.
+ *
+ * Supported in codecs: VP8
+ */
+ VP8E_SET_FRAME_FLAGS,
+
+ /*!\brief Codec control function to set max data rate for Inter frames.
+ *
+ * This value controls additional clamping on the maximum size of an
+ * inter frame. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * unlimited, or no additional clamping beyond the codec's built-in
+ * algorithm.
+ *
+ * For example, to allow no more than 4.5 frames worth of bitrate
+ * to an inter frame, set this to 450.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_MAX_INTER_BITRATE_PCT,
- /* TODO(jkoleszar): Move to vp9cx.h */
+ /*!\brief Boost percentage for Golden Frame in CBR mode.
+ *
+ * This value controls the amount of boost given to Golden Frame in
+ * CBR mode. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * the feature is off, i.e., no golden frame boost in CBR mode and
+ * average bitrate target is used.
+ *
+ * For example, to allow 100% more bits, i.e. 2X, in a golden frame
+ * than in an average frame, set this to 100.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_GF_CBR_BOOST_PCT,
+
+ /*!\brief Codec control function to set the temporal layer id.
+ *
+ * For temporal scalability: this control allows the application to set the
+ * layer id for each frame to be encoded. Note that this control must be set
+ * for every frame prior to encoding. The usage of this control function
+ * supersedes the internal temporal pattern counter, which is now deprecated.
+ *
+ * Supported in codecs: VP8
+ */
+ VP8E_SET_TEMPORAL_LAYER_ID,
+
+ /*!\brief Codec control function to set encoder screen content mode.
+ *
+ * Supported in codecs: VP8
+ */
+ VP8E_SET_SCREEN_CONTENT_MODE,
+
+ /*!\brief Codec control function to set lossless encoding mode.
+ *
+ * VP9 can operate in lossless encoding mode, in which the bitstream
+ * produced can be decoded to reconstruct a perfect copy of the input
+ * source. This control function provides a means to switch the encoder
+ * into lossless coding mode (1) or normal, possibly lossy, coding mode (0).
+ * 0 = lossy coding mode
+ * 1 = lossless coding mode
+ *
+ * By default, the encoder operates in normal coding mode (which may be lossy).
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_LOSSLESS,
+
+ /*!\brief Codec control function to set number of tile columns.
+ *
+ * In encoding and decoding, VP9 allows an input image frame to be partitioned
+ * into separate vertical tile columns, which can be encoded or decoded
+ * independently. This enables easy implementation of parallel encoding and
+ * decoding. This control requests the encoder to use column tiles in
+ * encoding an input frame, with the number of tile columns (in log2 units)
+ * as the parameter:
+ * 0 = 1 tile column
+ * 1 = 2 tile columns
+ * 2 = 4 tile columns
+ * .....
+ * n = 2**n tile columns
+ * The requested tile columns will be capped by encoder based on image size
+ * limitation (the minimum width of a tile column is 256 pixels, the maximum
+ * is 4096).
+ *
+ * By default, the value is 0, i.e. one single column tile for the entire image.
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_TILE_COLUMNS,
+
+ /*!\brief Codec control function to set number of tile rows.
+ *
+ * In encoding and decoding, VP9 allows an input image frame to be partitioned
+ * into separate horizontal tile rows. Tile rows are encoded or decoded
+ * sequentially. Even though encoding/decoding of later tile rows depends on
+ * earlier ones, this allows the encoder to output data packets for tile rows
+ * prior to completely processing all tile rows in a frame, thereby reducing
+ * the latency in processing between input and output. The parameter
+ * for this control describes the number of tile rows, which has a valid
+ * range [0, 2]:
+ * 0 = 1 tile row
+ * 1 = 2 tile rows
+ * 2 = 4 tile rows
+ *
+ * By default, the value is 0, i.e. one single row tile for the entire image.
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_TILE_ROWS,
+
+ /*!\brief Codec control function to enable frame parallel decoding feature.
+ *
+ * VP9 has a bitstream feature to reduce decoding dependency between frames
+ * by turning off backward update of probability context used in encoding
+ * and decoding. This allows staged parallel processing of more than one
+ * video frame in the decoder. This control function provides a means to
+ * turn this feature on or off for bitstreams produced by the encoder.
+ *
+ * By default, this feature is off.
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_FRAME_PARALLEL_DECODING,
+
+ /*!\brief Codec control function to set adaptive quantization mode.
+ *
+ * VP9 has a segment-based feature that allows the encoder to adaptively
+ * change the quantization parameter for each segment within a frame to
+ * improve the subjective quality. This control makes the encoder operate in
+ * one of several supported AQ modes.
+ *
+ * By default, the encoder operates with AQ mode 0 (adaptive quantization off).
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_AQ_MODE,
+
+ /*!\brief Codec control function to enable/disable periodic Q boost.
+ *
+ * One VP9 encoder speed feature is to enable quality boost by lowering
+ * frame-level Q periodically. This control function provides a means to
+ * turn this feature on or off.
+ * 0 = off
+ * 1 = on
+ *
+ * By default, the encoder is allowed to use this feature for appropriate
+ * encoding modes.
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_FRAME_PERIODIC_BOOST,
- /*!\brief control function to set noise sensitivity
+
+ /*!\brief Codec control function to set noise sensitivity.
+ *
+ * 0: off, 1: On(YOnly)
*
- * 0: off, 1: OnYOnly
+ * Supported in codecs: VP9
*/
VP9E_SET_NOISE_SENSITIVITY,
+ /*!\brief Codec control function to turn on/off SVC in encoder.
+ * \note Return value is VPX_CODEC_INVALID_PARAM if the encoder does not
+ * support SVC in its current encoding mode
+ * 0: off, 1: on
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_SVC,
+
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+ /*!\brief Codec control function to set parameters for SVC.
+ * \note Parameters contain min_q, max_q, scaling factor for each of the
+ * SVC layers.
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_SVC_PARAMETERS,
- /*!\brief control function to set svc layer for spatial and temporal.
+#endif
+
+ /*!\brief Codec control function to set svc layer for spatial and temporal.
* \note Valid ranges: 0..#vpx_codec_enc_cfg::ss_number_layers for spatial
* layer and 0..#vpx_codec_enc_cfg::ts_number_layers for
* temporal layer.
+ *
+ * Supported in codecs: VP9
*/
VP9E_SET_SVC_LAYER_ID,
- VP9E_SET_TUNE_CONTENT
+
+ /*!\brief Codec control function to set content type.
+ * \note Valid parameter range:
+ * VP9E_CONTENT_DEFAULT = Regular video content (Default)
+ * VP9E_CONTENT_SCREEN = Screen capture content
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_TUNE_CONTENT,
+
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+ /*!\brief Codec control function to get svc layer ID.
+ * \note The layer ID returned is for the data packet from the registered
+ * callback function.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_GET_SVC_LAYER_ID,
+
+ /*!\brief Codec control function to register a callback for per-layer packets.
+ * \note Parameter for this control function is a structure with a callback
+ * function and a pointer to private data used by the callback.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_REGISTER_CX_CALLBACK,
+#endif
+
+ /*!\brief Codec control function to set color space info.
+ * \note Valid ranges: 0..7, default is "UNKNOWN".
+ * 0 = UNKNOWN,
+ * 1 = BT_601
+ * 2 = BT_709
+ * 3 = SMPTE_170
+ * 4 = SMPTE_240
+ * 5 = BT_2020
+ * 6 = RESERVED
+ * 7 = SRGB
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_COLOR_SPACE,
+
+ /*!\brief Codec control function to get an Active map back from the encoder.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_GET_ACTIVEMAP,
};
/*!\brief vpx 1-D scaling mode
@@ -305,6 +602,7 @@ typedef enum {
VP8_TUNE_SSIM
} vp8e_tuning;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
/*!\brief vp9 svc layer parameters
*
* This defines the spatial and temporal layer id numbers for svc encoding.
@@ -316,6 +614,18 @@ typedef struct vpx_svc_layer_id {
int spatial_layer_id; /**< Spatial layer id number. */
int temporal_layer_id; /**< Temporal layer id number. */
} vpx_svc_layer_id_t;
+#else
+/*!\brief vp9 svc layer parameters
+ *
+ * This defines the temporal layer id numbers for svc encoding.
+ * This is used with the #VP9E_SET_SVC_LAYER_ID control to set the
+ * temporal layer id for the current frame.
+ *
+ */
+typedef struct vpx_svc_layer_id {
+ int temporal_layer_id; /**< Temporal layer id number. */
+} vpx_svc_layer_id_t;
+#endif
/*!\brief VP8 encoder control function parameter type
*
@@ -332,12 +642,17 @@ VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_UPD_ENTROPY, int)
VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_UPD_REFERENCE, int)
VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_USE_REFERENCE, int)
+VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int)
+VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int)
VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *)
VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *)
VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *)
VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *)
+VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *)
+#endif
VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int)
@@ -358,8 +673,16 @@ VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int)
VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *)
VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
+#endif
VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+
+VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
+
+VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
@@ -372,6 +695,10 @@ VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int)
VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int)
VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
+
+VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
+
+VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
/*! @} - end defgroup vp8_encoder */
#ifdef __cplusplus
} // extern "C"
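The expanded control list is exercised through vpx_codec_control(). A short sketch of a few of the VP9 controls documented above, on an already-initialized encoder context (values are illustrative; error handling elided):

    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, 2);            /* 4 tile columns */
    vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 1);
    vpx_codec_control(&codec, VP8E_SET_CPUUSED, 4);                 /* VP9 range: -8..8 */
    vpx_codec_control(&codec, VP9E_SET_COLOR_SPACE, VPX_CS_BT_709);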
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h
index 379b3062089..83898bf8496 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h
@@ -9,13 +9,13 @@
*/
-/*!\defgroup vp8_decoder WebM VP8 Decoder
+/*!\defgroup vp8_decoder WebM VP8/VP9 Decoder
* \ingroup vp8
*
* @{
*/
/*!\file
- * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder
+ * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder
* interface.
*/
#ifndef VPX_VP8DX_H_
@@ -30,14 +30,18 @@ extern "C" {
/*!\name Algorithm interface for VP8
*
- * This interface provides the capability to decode raw VP8 streams, as would
- * be found in AVI files and other non-Flash uses.
+ * This interface provides the capability to decode VP8 streams.
* @{
*/
extern vpx_codec_iface_t vpx_codec_vp8_dx_algo;
extern vpx_codec_iface_t *vpx_codec_vp8_dx(void);
+/*!@} - end algorithm interface member group*/
-/* TODO(jkoleszar): These move to VP9 in a later patch set. */
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to decode VP9 streams.
+ * @{
+ */
extern vpx_codec_iface_t vpx_codec_vp9_dx_algo;
extern vpx_codec_iface_t *vpx_codec_vp9_dx(void);
/*!@} - end algorithm interface member group*/
@@ -72,13 +76,34 @@ enum vp8_dec_control_id {
VPXD_SET_DECRYPTOR,
VP8D_SET_DECRYPTOR = VPXD_SET_DECRYPTOR,
- /** control function to get the display dimensions for the current frame. */
+ /** control function to get the dimensions that the current frame is decoded
+ * at. This may differ from the intended display size for the frame as
+ * specified in the wrapper or frame header (see VP9D_GET_DISPLAY_SIZE). */
+ VP9D_GET_FRAME_SIZE,
+
+ /** control function to get the current frame's intended display dimensions
+ * (as specified in the wrapper or frame header). This may differ from
+ * the decoded dimensions of this frame (see VP9D_GET_FRAME_SIZE). */
VP9D_GET_DISPLAY_SIZE,
/** control function to get the bit depth of the stream. */
VP9D_GET_BIT_DEPTH,
- /** For testing. */
+ /** control function to set the byte alignment of the planes in the reference
+ * buffers. Valid values are powers of 2, from 32 to 1024. A value of 0 sets
+ * legacy alignment, i.e. the Y plane is aligned to 32 bytes, the U plane
+ * directly follows the Y plane, and the V plane directly follows the U
+ * plane. The default value is 0.
+ */
+ VP9_SET_BYTE_ALIGNMENT,
+
+ /** control function to invert the decoding order to right-to-left. The
+ * function is used in a test to confirm the decoding independence of tile
+ * columns. The function may be used in applications where this order
+ * of decoding is desired.
+ *
+ * TODO(yaowu): Rework the unit test that uses this control, and in a future
+ * release, this test-only control shall be removed.
+ */
VP9_INVERT_TILE_DECODE_ORDER,
VP8_DECODER_CTRL_ID_MAX
@@ -122,6 +147,7 @@ VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *)
VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *)
VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *)
VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *)
+VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
/*! @} - end defgroup vp8_decoder */
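The two size queries above are easy to conflate: VP9D_GET_FRAME_SIZE reports the decoded dimensions, while VP9D_GET_DISPLAY_SIZE reports the intended display dimensions. A sketch on an initialized decoder context — note that the interface changes earlier in this patch reject VP9D_GET_FRAME_SIZE in frame-parallel mode:

    int frame_size[2], display_size[2];
    /* Powers of 2 from 32 to 1024 are valid; 0 restores legacy alignment. */
    vpx_codec_control(&codec, VP9_SET_BYTE_ALIGNMENT, 64);
    /* ... decode a frame ... */
    vpx_codec_control(&codec, VP9D_GET_FRAME_SIZE, frame_size);
    vpx_codec_control(&codec, VP9D_GET_DISPLAY_SIZE, display_size);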
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.h
index b25308ed9a6..b94e17370a2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.h
@@ -83,7 +83,7 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_CODEC_ABI_VERSION (2 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/
/*!\brief Algorithm return codes */
typedef enum {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h
index 044243d6b73..bf75584d589 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h
@@ -59,7 +59,7 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_ENCODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION (4 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Encoder capabilities bitfield
@@ -161,9 +161,9 @@ extern "C" {
VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */
VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */
VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */
- // TODO(minghai): This is for testing purporses. The released library can't
- // depend on vpx_config.h
-#if defined(CONFIG_SPATIAL_SVC) && CONFIG_SPATIAL_SVC
+ // Spatial SVC is still experimental and may be removed before the next ABI
+ // bump.
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/
VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/
#endif
@@ -203,9 +203,9 @@ extern "C" {
double psnr[4]; /**< PSNR, total/y/u/v */
} psnr; /**< data for PSNR packet */
vpx_fixed_buf_t raw; /**< data for arbitrary packets */
- // TODO(minghai): This is for testing purporses. The released library
- // can't depend on vpx_config.h
-#if defined(CONFIG_SPATIAL_SVC) && CONFIG_SPATIAL_SVC
+ // Spatial SVC is still experimental and may be removed before the next
+ // ABI bump.
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
size_t layer_sizes[VPX_SS_MAX_LAYERS];
struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS];
#endif
@@ -220,6 +220,22 @@ extern "C" {
} vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */
+ /*!\brief Encoder return output buffer callback
+ *
+ * This callback function, when registered, is invoked with a packet each
+ * time a spatial layer is encoded.
+ */
+ // putting the definitions here for now. (agrange: find if there
+ // is a better place for this)
+ typedef void (* vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt,
+ void *user_data);
+
+ /*!\brief Callback function pointer / user data pair storage */
+ typedef struct vpx_codec_enc_output_cx_cb_pair {
+ vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */
+ void *user_priv; /**< Pointer to private data */
+ } vpx_codec_priv_output_cx_pkt_cb_pair_t;
+
/*!\brief Rational Number
*
* This structure holds a fractional value.
@@ -721,10 +737,10 @@ extern "C" {
*
*/
typedef struct vpx_svc_parameters {
- int max_quantizers[VPX_SS_MAX_LAYERS];
- int min_quantizers[VPX_SS_MAX_LAYERS];
- int scaling_factor_num[VPX_SS_MAX_LAYERS];
- int scaling_factor_den[VPX_SS_MAX_LAYERS];
+ int max_quantizers[VPX_SS_MAX_LAYERS]; /**< Max Q for each layer */
+ int min_quantizers[VPX_SS_MAX_LAYERS]; /**< Min Q for each layer */
+ int scaling_factor_num[VPX_SS_MAX_LAYERS]; /**< Scaling factor-numerator*/
+ int scaling_factor_den[VPX_SS_MAX_LAYERS]; /**< Scaling factor-denominator*/
} vpx_svc_extra_cfg_t;
@@ -811,9 +827,9 @@ extern "C" {
* be called by all applications to initialize the configuration structure
* before specializing the configuration with application specific values.
*
- * \param[in] iface Pointer to the algorithm interface to use.
- * \param[out] cfg Configuration buffer to populate
- * \param[in] usage End usage. Set to 0 or use codec specific values.
+ * \param[in] iface Pointer to the algorithm interface to use.
+ * \param[out] cfg Configuration buffer to populate.
+ * \param[in] reserved Must be set to 0 for VP8 and VP9.
*
* \retval #VPX_CODEC_OK
* The configuration was populated.
@@ -824,7 +840,7 @@ extern "C" {
*/
vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
vpx_codec_enc_cfg_t *cfg,
- unsigned int usage);
+ unsigned int reserved);
/*!\brief Set or change configuration
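Renaming the third parameter to `reserved` formalizes existing practice: VP8 and VP9 callers already pass 0. Typical initialization is unchanged; a minimal sketch:

    vpx_codec_enc_cfg_t cfg;
    vpx_codec_err_t res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0);
    if (res != VPX_CODEC_OK)
      return res;
    cfg.g_w = 640;   /* then specialize further before vpx_codec_enc_init() */
    cfg.g_h = 480;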
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_frame_buffer.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_frame_buffer.h
index 41038b10df6..9036459af0a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_frame_buffer.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_frame_buffer.h
@@ -22,8 +22,11 @@ extern "C" {
#include "./vpx_integer.h"
/*!\brief The maximum number of work buffers used by libvpx.
+ * Supports a maximum of 4 threads decoding video in parallel.
+ * Each thread will use one work buffer.
+ * TODO(hkuang): Add support to set the number of worker threads dynamically.
*/
-#define VPX_MAXIMUM_WORK_BUFFERS 1
+#define VPX_MAXIMUM_WORK_BUFFERS 8
/*!\brief The maximum number of reference buffers that a VP9 encoder may use.
*/
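Raising VPX_MAXIMUM_WORK_BUFFERS matters to applications that supply external frame buffers: the minimum pool a decoder may request is VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS, so pools sized from these constants grow automatically with this change. A sketch, where get_fb/release_fb are hypothetical vpx_get_frame_buffer_cb_fn_t / vpx_release_frame_buffer_cb_fn_t implementations over a pool of that size and die() is a hypothetical helper:

    #define POOL_SIZE (VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS)

    if (vpx_codec_set_frame_buffer_functions(&codec, get_fb, release_fb, pool))
      die("failed to set external frame buffer functions");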
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_image.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_image.h
index 337e4c4bedb..c06d35101cc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_image.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_image.h
@@ -28,7 +28,7 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/
#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */
@@ -66,9 +66,22 @@ extern "C" {
VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH
} vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
+ /*!\brief List of supported color spaces */
+ typedef enum vpx_color_space {
+ VPX_CS_UNKNOWN = 0, /**< Unknown */
+ VPX_CS_BT_601 = 1, /**< BT.601 */
+ VPX_CS_BT_709 = 2, /**< BT.709 */
+ VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */
+ VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */
+ VPX_CS_BT_2020 = 5, /**< BT.2020 */
+ VPX_CS_RESERVED = 6, /**< Reserved */
+ VPX_CS_SRGB = 7 /**< sRGB */
+ } vpx_color_space_t; /**< alias for enum vpx_color_space */
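The new cs field travels with the raw image: encoders pick it up via image2yuvconfig() (see the vp9_iface_common.h hunk above) and decoders report it back through yuvconfig2image(). Setting it on an allocated image is a one-liner; a sketch:

    vpx_image_t *raw = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, 640, 480, 1);
    raw->cs = VPX_CS_BT_709;  /* propagated into YV12_BUFFER_CONFIG::color_space */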
+
/**\brief Image Descriptor */
typedef struct vpx_image {
vpx_img_fmt_t fmt; /**< Image Format */
+ vpx_color_space_t cs; /**< Color Space */
/* Image storage dimensions */
unsigned int w; /**< Stored image width */
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_integer.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_integer.h
index 500f9b901eb..829c9d132c8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_integer.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_integer.h
@@ -37,6 +37,8 @@ typedef unsigned int uint32_t;
typedef signed __int64 int64_t;
typedef unsigned __int64 uint64_t;
#define INT64_MAX _I64_MAX
+#define INT32_MAX _I32_MAX
+#define INT32_MIN _I32_MIN
#define INT16_MAX _I16_MAX
#define INT16_MIN _I16_MIN
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c
new file mode 100644
index 00000000000..c7704dc1be6
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+ const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
+ vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
+ vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
+// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
+// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
+// and vec_sum_ref_hi.
+static void sad_neon_64(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16,
+ const uint8x16_t vec_src_32,
+ const uint8x16_t vec_src_48,
+ const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+ const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+ const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+ vget_low_u8(vec_ref_32));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+ vget_high_u8(vec_ref_32));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+ vget_low_u8(vec_ref_48));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
+ vget_high_u8(vec_ref_48));
+}
+
+// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
+// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
+static void sad_neon_32(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16,
+ const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+}
+
+void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 64; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+ const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+ const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
+ &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+ &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+ &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+ &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 32; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+ sad_neon_32(vec_src_00, vec_src_16, ref0,
+ &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref1,
+ &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref2,
+ &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref3,
+ &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t vec_src = vld1q_u8(src);
+ const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+ const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+ const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+ const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+ vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref0));
+ vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref0));
+ vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref1));
+ vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref1));
+ vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref2));
+ vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref2));
+ vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref3));
+ vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref3));
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
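All three NEON kernels above compute the same quantity at different block sizes: four SADs of one source block against four candidate reference blocks. A scalar reference for the 64x64 case (illustration only, not part of the patch):

    #include <stdlib.h>
    #include "vpx/vpx_integer.h"

    static void sad64x64x4d_model(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[4], int ref_stride,
                                  uint32_t res[4]) {
      int k, i, j;
      for (k = 0; k < 4; ++k) {
        const uint8_t *s = src;
        const uint8_t *r = ref[k];
        uint32_t sad = 0;
        for (i = 0; i < 64; ++i) {
          for (j = 0; j < 64; ++j) sad += abs(s[j] - r[j]);
          s += src_stride;
          r += ref_stride;
        }
        res[k] = sad;
      }
    }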
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_media.asm
index 1b4f5cf3b0f..aed1d3a22ed 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_media.asm
@@ -9,7 +9,7 @@
;
- EXPORT |vp8_sad16x16_armv6|
+ EXPORT |vpx_sad16x16_media|
ARM
REQUIRE8
@@ -21,8 +21,7 @@
; r1 int src_stride
; r2 const unsigned char *ref_ptr
; r3 int ref_stride
-; stack max_sad (not used)
-|vp8_sad16x16_armv6| PROC
+|vpx_sad16x16_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c
index c4cd856804d..173f08ac3c3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c
@@ -9,11 +9,113 @@
*/
#include <arm_neon.h>
-#include "./vp9_rtcd.h"
+
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+unsigned int vpx_sad8x16_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 15; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+unsigned int vpx_sad4x4_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x2_t d1;
+ uint64x1_t d3;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 3; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ d1 = vpaddl_u16(vget_low_u16(q12));
+ d3 = vpaddl_u32(d1);
+
+ return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
+unsigned int vpx_sad16x8_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x16_t q0, q4;
+ uint16x8_t q12, q13;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+ for (i = 0; i < 7; i++) {
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+ }
+
+ q12 = vaddq_u16(q12, q13);
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
@@ -34,7 +136,7 @@ static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
return vget_lane_u32(c, 0);
}
-unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -70,7 +172,7 @@ unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}
-unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -95,7 +197,7 @@ unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
-unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -114,7 +216,7 @@ unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
-unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum = vdupq_n_u16(0);
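Every kernel in this file follows one shape: the first row initializes the 16-bit lane accumulators with vabdl, each remaining row accumulates with vabal (hence loop counts of 15, 3 and 7), and a pairwise-add chain folds the lanes into a scalar. The 4x4 kernel loads 8 bytes per row and lets the final vget_low_u16 reduction discard the four lanes that fall outside the block. A scalar model of the 8-wide pattern (illustration only):

    #include <stdlib.h>
    #include "vpx/vpx_integer.h"

    static unsigned int sad8x16_model(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride) {
      uint16_t acc[8] = { 0 };  /* one lane per column, like q12 */
      unsigned int sum = 0;
      int i, j;
      for (i = 0; i < 16; ++i) {  /* vabdl once, then vabal 15 times */
        for (j = 0; j < 8; ++j)
          acc[j] = (uint16_t)(acc[j] + abs(src[j] - ref[j]));
        src += src_stride;
        ref += ref_stride;
      }
      for (j = 0; j < 8; ++j) sum += acc[j];  /* vpaddl/vadd reduction */
      return sum;
    }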
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c
index 73134f2f2c0..9db312fbe05 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -10,15 +10,19 @@
#include <stdlib.h>
-#include "./vp9_rtcd.h"
#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+
#if CONFIG_VP9_HIGHBITDEPTH
#include "vp9/common/vp9_common.h"
-#endif
-#include "vp9/encoder/vp9_variance.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
+// Temporary ...
+#define ROUND_POWER_OF_TWO(value, n) \
+ (((value) + (1 << ((n) - 1))) >> (n))
+/* Sum the difference between every corresponding element of the buffers. */
static INLINE unsigned int sad(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int width, int height) {
@@ -35,35 +39,78 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
return sad;
}
+/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
+ * The function averages every corresponding element of the buffers and stores
+ * the value in a third buffer, comp_pred.
+ * pred and comp_pred are assumed to have stride = width
+ * In the usage below comp_pred is a local array.
+ */
+static INLINE void avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int i, j;
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ int i, j;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#define sadMxN(m, n) \
-unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride) { \
return sad(src, src_stride, ref, ref_stride, m, n); \
} \
-unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
uint8_t comp_pred[m * n]; \
- vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
return sad(src, src_stride, comp_pred, m, m, n); \
}
+// Depending on call sites, ref_array could be passed as **ref_array to avoid
+// taking its address in the call below, and this could be de-duplicated with
+// the 4D macro below.
#define sadMxNxK(m, n, k) \
-void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- unsigned int *sads) { \
+void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref_array, int ref_stride, \
+ uint32_t *sad_array) { \
int i; \
for (i = 0; i < k; ++i) \
- sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
+    sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], \
+                                        ref_stride); \
}
+// This appears to be equivalent to the above when k == 4 and refs is const
#define sadMxNx4D(m, n) \
-void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const refs[], int ref_stride, \
- unsigned int *sads) { \
+void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], int ref_stride, \
+ uint32_t *sad_array) { \
int i; \
for (i = 0; i < 4; ++i) \
- sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
+    sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], \
+                                        ref_stride); \
}
// 64x64
@@ -169,40 +216,40 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
}
#define highbd_sadMxN(m, n) \
-unsigned int vp9_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride) { \
return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
} \
-unsigned int vp9_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
+unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
const uint8_t *second_pred) { \
uint16_t comp_pred[m * n]; \
- vp9_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ highbd_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
}
#define highbd_sadMxNxK(m, n, k) \
-void vp9_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- unsigned int *sads) { \
+void vpx_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref_array, int ref_stride, \
+ uint32_t *sad_array) { \
int i; \
for (i = 0; i < k; ++i) { \
- sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, &ref[i], \
- ref_stride); \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, &ref_array[i], \
+ ref_stride); \
} \
}
#define highbd_sadMxNx4D(m, n) \
-void vp9_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const refs[], \
- int ref_stride, unsigned int *sads) { \
+void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
int i; \
for (i = 0; i < 4; ++i) { \
- sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, refs[i], \
- ref_stride); \
- } \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, ref_array[i], \
+ ref_stride); \
+ } \
}
// 64x64
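ROUND_POWER_OF_TWO(v, 1) is a round-to-nearest halving, (v + 1) >> 1, so avg_pred matches the rounding of the SIMD averaging instructions used by the _avg kernels. A quick check (illustration only):

    #include <assert.h>

    #define ROUND_POWER_OF_TWO(value, n) \
        (((value) + (1 << ((n) - 1))) >> (n))

    int main(void) {
      assert(ROUND_POWER_OF_TWO(3 + 4, 1) == 4);  /* half rounds up */
      assert(ROUND_POWER_OF_TWO(2 + 4, 1) == 3);  /* exact average */
      return 0;
    }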
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
new file mode 100644
index 00000000000..606515d2c19
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
@@ -0,0 +1,40 @@
+##
+## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+DSP_SRCS-yes += vpx_dsp.mk
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += sad.c
+
+DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
+
+DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm
+DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+endif # CONFIG_VP9_HIGHBITDEPTH
+endif # CONFIG_ENCODERS
+
+DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
+
+DSP_SRCS-yes += vpx_dsp_rtcd.c
+DSP_SRCS-yes += vpx_dsp_rtcd_defs.pl
+
+$(eval $(call rtcd_h_template,vpx_dsp_rtcd,vpx_dsp/vpx_dsp_rtcd_defs.pl))
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c
new file mode 100644
index 00000000000..5fe27b614bd
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+void vpx_dsp_rtcd() {
+ once(setup_rtcd_internal);
+}
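Routing through once() guarantees that setup_rtcd_internal, the generated routine that installs the best per-CPU implementation behind each function pointer, runs exactly one time even when several threads call vpx_dsp_rtcd() concurrently. A portable model of the guard (illustration; the real vpx_ports/vpx_once.h picks a platform-specific mechanism):

    #include <pthread.h>

    static pthread_once_t rtcd_once = PTHREAD_ONCE_INIT;

    /* Run func at most once, process-wide. */
    static void once(void (*func)(void)) {
      pthread_once(&rtcd_once, func);
    }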
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
new file mode 100644
index 00000000000..ebec9ec0660
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -0,0 +1,395 @@
+sub vpx_dsp_forward_decls() {
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+
+EOF
+}
+forward_decls qw/vpx_dsp_forward_decls/;
+
+# Functions which use x86inc.asm instead of x86_abi_support.asm
+if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
+ $mmx_x86inc = 'mmx';
+ $sse_x86inc = 'sse';
+ $sse2_x86inc = 'sse2';
+ $ssse3_x86inc = 'ssse3';
+ $avx_x86inc = 'avx';
+ $avx2_x86inc = 'avx2';
+} else {
+ $mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc =
+ $avx_x86inc = $avx2_x86inc = '';
+}
+
+# Functions which are 64 bit only.
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+} else {
+ $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 =
+ $avx_x86_64 = $avx2_x86_64 = '';
+}
+
+if (vpx_config("CONFIG_ENCODERS") eq "yes") {
+#
+# Single block SAD
+#
+add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad64x64 avx2 neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad64x32 avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x64 avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x32 avx2 neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x16 avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x32/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x16 mmx media neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x8 mmx neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x16 mmx neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x8 mmx neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x4/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x8/, "$sse_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x4 mmx neon/, "$sse_x86inc";
+
+#
+# Avg
+#
+add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad64x64_avg avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad64x32_avg avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x64_avg avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x32_avg avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x16_avg avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x32_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x16_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x8_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x16_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x8_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x4_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad4x8_avg/, "$sse_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad4x4_avg/, "$sse_x86inc";
+
+#
+# Multi-block SAD, comparing a source block to N reference blocks 1 pixel apart horizontally
+#
+# Blocks of 3
+add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x3 sse3 ssse3/;
+
+add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x3 sse3 ssse3/;
+
+add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x3 sse3/;
+
+add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x3 sse3/;
+
+add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x3 sse3/;
+
+# Blocks of 8
+add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x8 sse4_1/;
+
+add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x8 sse4_1/;
+
+add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x8 sse4_1/;
+
+add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x8 sse4_1/;
+
+add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x8 sse4_1/;
+
+#
+# Multi-block SAD, comparing a source block to N independent reference blocks
+#
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x64x4d avx2 neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x32x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x64x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x4d avx2 neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x16x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x32x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x4d neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x4x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x8x4d/, "$sse_x86inc";
+
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x4d/, "$sse_x86inc";
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ #
+ # Single block SAD
+ #
+ add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad64x64/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad64x32/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad32x64/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad32x32/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad32x16/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad16x32/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad16x16/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad16x8/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad8x16/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad8x8/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad8x4/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x8/;
+
+ add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x4/;
+
+ #
+ # Avg
+ #
+ add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad64x32_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad32x64_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad32x32_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad32x16_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad16x32_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad16x16_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad16x8_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad8x16_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad8x8_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad8x4_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x8_avg/;
+
+ add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x4_avg/;
+
+ #
+  # Multi-block SAD, comparing a source block to N reference blocks 1 pixel apart horizontally
+ #
+ # Blocks of 3
+ add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad64x64x3/;
+
+ add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad32x32x3/;
+
+ add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x16x3/;
+
+ add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x8x3/;
+
+ add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x16x3/;
+
+ add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x8x3/;
+
+ add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad4x4x3/;
+
+ # Blocks of 8
+ add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad64x64x8/;
+
+ add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad32x32x8/;
+
+ add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x16x8/;
+
+ add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x8x8/;
+
+ add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x16x8/;
+
+ add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x8x8/;
+
+ add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x4x8/;
+
+ add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad4x8x8/;
+
+ add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad4x4x8/;
+
+ #
+  # Multi-block SAD, comparing a source block to N independent reference blocks
+ #
+ add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad64x64x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad64x32x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad32x64x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad32x32x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad32x16x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x32x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x16x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x8x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x16x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x8x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x4x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad4x8x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad4x4x4d/, "$sse2_x86inc";
+
+} # CONFIG_VP9_HIGHBITDEPTH
+} # CONFIG_ENCODERS
+
+1;
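Each add_proto/specialize pair above becomes a function pointer in the generated vpx_dsp_rtcd.h plus an assignment in setup_rtcd_internal that upgrades the C fallback when the CPU allows. A hand-written sketch for vpx_sad64x64 (the probe and flag names follow vpx_ports/x86.h; the exact generated shape is an assumption):

    #include "vpx/vpx_integer.h"
    #include "vpx_ports/x86.h"  /* x86_simd_caps(), HAS_SSE2, HAS_AVX2 */

    unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride);
    unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride);
    unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride);

    unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *ref_ptr, int ref_stride);

    static void setup_rtcd_internal(void) {
      const int flags = x86_simd_caps();  /* runtime CPU feature probe */
      vpx_sad64x64 = vpx_sad64x64_c;      /* always-available fallback */
      if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2;
      if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2;
    }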
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 00000000000..95cc4372ec3
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,289 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_4x2x4 5-6 0
+ movh m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m4, [ref1q+%5*2]
+ movhps m5, [ref2q+%5*2]
+ movhps m6, [ref3q+%5*2]
+ movhps m7, [ref4q+%5*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ movu m2, [ref1q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m2, [ref1q+%5*2]
+ mova m3, m0
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m4, m2
+
+ movu m2, [ref2q+%3*2]
+ mova m3, m0
+ movhps m2, [ref2q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m5, m2
+
+ movu m2, [ref3q+%3*2]
+ mova m3, m0
+ movhps m2, [ref3q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m6, m2
+
+ movu m2, [ref4q+%3*2]
+ mova m3, m0
+ movhps m2, [ref4q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+ ; 1st 8 px
+ mova m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ mova m3, m0
+ movu m2, [ref1q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+
+ ; 2nd 8 px
+ mova m0, [srcq +(%4)*2]
+ mova m3, m0
+ movu m2, [ref1q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+ HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
+ HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+ HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
+ HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+ HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
+ HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
+%endmacro
+
+; void vpx_highbd_sadNxMx4d_sse2(uint8_t *src, int src_stride,
+;                                uint8_t *ref[4], int ref_stride,
+;                                uint32_t res[4]);
+; where NxM is any of the block sizes instantiated below, from 64x64 down to 4x4
+%macro HIGH_SADNXN4D 2
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+
+; set m1 to a vector of word 1s; pmaddwd against m1 pair-sums u16 diffs to u32
+ push srcq
+ mov srcd, 0x00010001
+ movd m1, srcd
+ pshufd m1, m1, 0x0
+ pop srcq
+
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+ shl srcq, 1
+ shl ref2q, 1
+ shl ref3q, 1
+ shl ref4q, 1
+ shl ref1q, 1
+
+ HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+  ; N.B. HIGH_PROCESS outputs dwords (32 bits),
+  ; so in high bit depth even the smallest width (4) needs 128 bits, i.e. an XMM register
+ movhlps m0, m4
+ movhlps m1, m5
+ movhlps m2, m6
+ movhlps m3, m7
+ paddd m4, m0
+ paddd m5, m1
+ paddd m6, m2
+ paddd m7, m3
+ punpckldq m4, m5
+ punpckldq m6, m7
+ movhlps m0, m4
+ movhlps m1, m6
+ paddd m4, m0
+ paddd m6, m1
+ punpcklqdq m4, m6
+ movifnidn r4, r4mp
+ movu [r4], m4
+ RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16, 8
+HIGH_SADNXN4D 8, 16
+HIGH_SADNXN4D 8, 8
+HIGH_SADNXN4D 8, 4
+HIGH_SADNXN4D 4, 8
+HIGH_SADNXN4D 4, 4
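The recurring psubusw/psubusw/por triple is a branch-free |a - b| for unsigned words: each saturating subtraction zeroes the wrong-sign direction and the OR keeps the survivor; pmaddwd against the vector of word 1s in m1 then widens and pair-sums the 16-bit results into dwords. A scalar model of one lane (illustration only):

    #include "vpx/vpx_integer.h"

    static uint16_t abs_diff_u16(uint16_t a, uint16_t b) {
      const uint16_t d0 = (uint16_t)(a > b ? a - b : 0);  /* psubusw m3, m2 */
      const uint16_t d1 = (uint16_t)(b > a ? b - a : 0);  /* psubusw m2, m0 */
      return (uint16_t)(d0 | d1);                         /* por m2, m3 */
    }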
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 00000000000..4d422dde3af
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,365 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro HIGH_SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+ shl srcq, 1
+ shl refq, 1
+%if %4 == 1
+ shl second_predq, 1
+%endif
+%endmacro
+
+; unsigned int vpx_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
+;                                            uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0
+ HIGH_SAD_FN 64, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ ; first half of each row
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ ; second half of each row
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq+64]
+ psubusw m5, m1
+ psubusw m1, [srcq+64]
+ por m1, m5
+ mova m5, [srcq+80]
+ psubusw m5, m2
+ psubusw m2, [srcq+80]
+ por m2, m5
+ mova m5, [srcq+96]
+ psubusw m5, m3
+ psubusw m3, [srcq+96]
+ por m3, m5
+ mova m5, [srcq+112]
+ psubusw m5, m4
+ psubusw m4, [srcq+112]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+
+
+; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+ HIGH_SAD_FN 32, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+
+; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+ HIGH_SAD_FN 16, %1, 5, %2
+ mov n_rowsd, %1/2
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+16]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*2+16]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*2]
+ por m3, m5
+ mova m5, [srcq+src_strideq*2+16]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_strideq*2+16]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+
+
+; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0
+ HIGH_SAD_FN 8, %1, 7, %2
+ mov n_rowsd, %1/4
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq*2]
+ movu m3, [refq+ref_strideq*4]
+ movu m4, [refq+ref_stride3q*2]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m2
+ psubusw m2, [srcq+src_strideq*2]
+ por m2, m5
+ mova m5, [srcq+src_strideq*4]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*4]
+ por m3, m5
+ mova m5, [srcq+src_stride3q*2]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_stride3q*2]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*8]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
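
In all of the _avg variants above, the second_pred paths first average the reference against a second predictor with pavgw, giving the SAD used for compound prediction. A scalar equivalent of the lane operation (a sketch, not the kernel):

    #include <stdint.h>

    /* pavgw: rounding average of two 16-bit lanes, applied to ref and
     * second_pred before the absolute difference is taken. */
    static uint16_t avg_round_u16(uint16_t a, uint16_t b) {
      return (uint16_t)(((uint32_t)a + b + 1) >> 1);
    }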
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c
index 1feed62566b..4128f2ac37c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c
@@ -10,11 +10,11 @@
#include <immintrin.h> // AVX2
#include "vpx/vpx_integer.h"
-void vp9_sad32x32x4d_avx2(uint8_t *src,
+void vpx_sad32x32x4d_avx2(uint8_t *src,
int src_stride,
uint8_t *ref[4],
int ref_stride,
- unsigned int res[4]) {
+ uint32_t res[4]) {
__m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
__m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
__m256i sum_mlow, sum_mhigh;
@@ -80,11 +80,11 @@ void vp9_sad32x32x4d_avx2(uint8_t *src,
}
}
-void vp9_sad64x64x4d_avx2(uint8_t *src,
+void vpx_sad64x64x4d_avx2(uint8_t *src,
int src_stride,
uint8_t *ref[4],
int ref_stride,
- unsigned int res[4]) {
+ uint32_t res[4]) {
__m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
__m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
__m256i ref3_reg, ref3next_reg;
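
The x4d kernels renamed in this file score one source block against four candidate references in a single pass, writing four results. A plain-C statement of that contract as implied by the prototype above (the loop structure is an illustrative assumption, not the AVX2 implementation):

    #include <stdint.h>
    #include <stdlib.h>

    static void sad32x32x4d_ref(const uint8_t *src, int src_stride,
                                const uint8_t *ref[4], int ref_stride,
                                uint32_t res[4]) {
      for (int i = 0; i < 4; ++i) {
        const uint8_t *s = src, *r = ref[i];
        uint32_t sad = 0;
        for (int y = 0; y < 32; ++y) {
          for (int x = 0; x < 32; ++x) sad += abs(s[x] - r[x]);
          s += src_stride;
          r += ref_stride;
        }
        res[i] = sad;
      }
    }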
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_sse2.asm
index b4936281f62..0f7fb93d47c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_sse2.asm
@@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
+%define program_name vpx
+
%include "third_party/x86inc/x86inc.asm"
SECTION .text
@@ -167,9 +169,9 @@ SECTION .text
PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro
-; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; uint8_t *ref[4], int ref_stride,
-; unsigned int res[4]);
+; uint32_t res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
%macro SADNXN4D 2
%if UNIX64
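
The %define program_name vpx added above is what drives the symbol renames in this file: x86inc.asm's cglobal prefixes every declared function with program_name, so cglobal-declared kernels now emit vpx_* symbols. A rough C preprocessor analogy (helper names hypothetical, not from libvpx):

    #define program_name vpx
    /* Two-level paste so program_name expands before concatenation. */
    #define PASTE2(a, b) a##_##b
    #define PASTE(a, b) PASTE2(a, b)
    #define MAKE_SYM(name) PASTE(program_name, name)
    /* MAKE_SYM(sad64x64_sse2) expands to vpx_sad64x64_sse2 */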
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c
index 113193070e1..78536a47218 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -11,7 +11,7 @@
#include "vpx_ports/mem.h"
#define FSAD64_H(h) \
-unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride) { \
@@ -40,7 +40,7 @@ unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
}
#define FSAD32_H(h) \
-unsigned int vp9_sad32x##h##_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride) { \
@@ -89,7 +89,7 @@ FSAD32;
#undef FSAD32_H
#define FSADAVG64_H(h) \
-unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride, \
@@ -124,7 +124,7 @@ unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
}
#define FSADAVG32_H(h) \
-unsigned int vp9_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride, \
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_mmx.asm
index 592112fa91d..9968992bd13 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_mmx.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_mmx.asm
@@ -11,18 +11,18 @@
%include "vpx_ports/x86_abi_support.asm"
-global sym(vp8_sad16x16_mmx) PRIVATE
-global sym(vp8_sad8x16_mmx) PRIVATE
-global sym(vp8_sad8x8_mmx) PRIVATE
-global sym(vp8_sad4x4_mmx) PRIVATE
-global sym(vp8_sad16x8_mmx) PRIVATE
+global sym(vpx_sad16x16_mmx) PRIVATE
+global sym(vpx_sad8x16_mmx) PRIVATE
+global sym(vpx_sad8x8_mmx) PRIVATE
+global sym(vpx_sad4x4_mmx) PRIVATE
+global sym(vpx_sad16x8_mmx) PRIVATE
-;unsigned int vp8_sad16x16_mmx(
+;unsigned int vpx_sad16x16_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
-sym(vp8_sad16x16_mmx):
+sym(vpx_sad16x16_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -109,12 +109,12 @@ sym(vp8_sad16x16_mmx):
ret
-;unsigned int vp8_sad8x16_mmx(
+;unsigned int vpx_sad8x16_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
-sym(vp8_sad8x16_mmx):
+sym(vpx_sad8x16_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -181,12 +181,12 @@ sym(vp8_sad8x16_mmx):
ret
-;unsigned int vp8_sad8x8_mmx(
+;unsigned int vpx_sad8x8_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
-sym(vp8_sad8x8_mmx):
+sym(vpx_sad8x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -251,12 +251,12 @@ sym(vp8_sad8x8_mmx):
ret
-;unsigned int vp8_sad4x4_mmx(
+;unsigned int vpx_sad4x4_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
-sym(vp8_sad4x4_mmx):
+sym(vpx_sad4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -340,12 +340,12 @@ sym(vp8_sad4x4_mmx):
ret
-;unsigned int vp8_sad16x8_mmx(
+;unsigned int vpx_sad16x8_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
-sym(vp8_sad16x8_mmx):
+sym(vpx_sad16x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse2.asm
index c4c5c54f0e4..c6a829dc21e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse2.asm
@@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
+%define program_name vpx
+
%include "third_party/x86inc/x86inc.asm"
SECTION .text
@@ -44,7 +46,7 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
%endif ; %3 == 7
%endmacro
-; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
SAD_FN 64, %1, 5, %2
@@ -87,7 +89,7 @@ SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
-; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
SAD_FN 32, %1, 5, %2
@@ -132,7 +134,7 @@ SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
-; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
SAD_FN 16, %1, 7, %2
@@ -178,7 +180,7 @@ SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2
-; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
SAD_FN 8, %1, 7, %2
@@ -222,7 +224,7 @@ SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2
-; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
+; unsigned int vpx_sad4x{4,8}_sse(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
SAD_FN 4, %1, 7, %2
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm
index 2b90a5d5478..18279bdb9de 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm
@@ -19,7 +19,6 @@
%define end_ptr rcx
%define ret_var rbx
%define result_ptr arg(4)
- %define max_err arg(4)
%define height dword ptr arg(4)
push rbp
mov rbp, rsp
@@ -42,7 +41,6 @@
%define end_ptr r10
%define ret_var r11
%define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define max_err [rsp+xmm_stack_space+8+4*8]
%define height dword ptr [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
@@ -52,7 +50,6 @@
%define end_ptr r9
%define ret_var r10
%define result_ptr r8
- %define max_err r8
%define height r8
%endif
%endif
@@ -67,7 +64,6 @@
%define end_ptr
%define ret_var
%define result_ptr
- %define max_err
%define height
%if ABI_IS_32BIT
@@ -169,14 +165,14 @@
paddw mm7, mm3
%endmacro
-;void int vp9_sad16x16x3_sse3(
+;void vpx_sad16x16x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp9_sad16x16x3_sse3) PRIVATE
-sym(vp9_sad16x16x3_sse3):
+global sym(vpx_sad16x16x3_sse3) PRIVATE
+sym(vpx_sad16x16x3_sse3):
STACK_FRAME_CREATE_X3
@@ -211,14 +207,14 @@ sym(vp9_sad16x16x3_sse3):
STACK_FRAME_DESTROY_X3
-;void int vp9_sad16x8x3_sse3(
+;void vpx_sad16x8x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp9_sad16x8x3_sse3) PRIVATE
-sym(vp9_sad16x8x3_sse3):
+global sym(vpx_sad16x8x3_sse3) PRIVATE
+sym(vpx_sad16x8x3_sse3):
STACK_FRAME_CREATE_X3
@@ -249,14 +245,14 @@ sym(vp9_sad16x8x3_sse3):
STACK_FRAME_DESTROY_X3
-;void int vp9_sad8x16x3_sse3(
+;void vpx_sad8x16x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp9_sad8x16x3_sse3) PRIVATE
-sym(vp9_sad8x16x3_sse3):
+global sym(vpx_sad8x16x3_sse3) PRIVATE
+sym(vpx_sad8x16x3_sse3):
STACK_FRAME_CREATE_X3
@@ -278,14 +274,14 @@ sym(vp9_sad8x16x3_sse3):
STACK_FRAME_DESTROY_X3
-;void int vp9_sad8x8x3_sse3(
+;void vpx_sad8x8x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp9_sad8x8x3_sse3) PRIVATE
-sym(vp9_sad8x8x3_sse3):
+global sym(vpx_sad8x8x3_sse3) PRIVATE
+sym(vpx_sad8x8x3_sse3):
STACK_FRAME_CREATE_X3
@@ -303,14 +299,14 @@ sym(vp9_sad8x8x3_sse3):
STACK_FRAME_DESTROY_X3
-;void int vp9_sad4x4x3_sse3(
+;void vpx_sad4x4x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp9_sad4x4x3_sse3) PRIVATE
-sym(vp9_sad4x4x3_sse3):
+global sym(vpx_sad4x4x3_sse3) PRIVATE
+sym(vpx_sad4x4x3_sse3):
STACK_FRAME_CREATE_X3
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm
index faf1768a983..bc674479715 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm
@@ -165,14 +165,14 @@
movdqa [rdi + 16], xmm2
%endmacro
-;void vp9_sad16x16x8_sse4(
+;void vpx_sad16x16x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array);
-global sym(vp9_sad16x16x8_sse4) PRIVATE
-sym(vp9_sad16x16x8_sse4):
+global sym(vpx_sad16x16x8_sse4_1) PRIVATE
+sym(vpx_sad16x16x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -205,15 +205,15 @@ sym(vp9_sad16x16x8_sse4):
ret
-;void vp9_sad16x8x8_sse4(
+;void vpx_sad16x8x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vp9_sad16x8x8_sse4) PRIVATE
-sym(vp9_sad16x8x8_sse4):
+global sym(vpx_sad16x8x8_sse4_1) PRIVATE
+sym(vpx_sad16x8x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -242,15 +242,15 @@ sym(vp9_sad16x8x8_sse4):
ret
-;void vp9_sad8x8x8_sse4(
+;void vpx_sad8x8x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vp9_sad8x8x8_sse4) PRIVATE
-sym(vp9_sad8x8x8_sse4):
+global sym(vpx_sad8x8x8_sse4_1) PRIVATE
+sym(vpx_sad8x8x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -279,15 +279,15 @@ sym(vp9_sad8x8x8_sse4):
ret
-;void vp9_sad8x16x8_sse4(
+;void vpx_sad8x16x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vp9_sad8x16x8_sse4) PRIVATE
-sym(vp9_sad8x16x8_sse4):
+global sym(vpx_sad8x16x8_sse4_1) PRIVATE
+sym(vpx_sad8x16x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -320,15 +320,15 @@ sym(vp9_sad8x16x8_sse4):
ret
-;void vp9_sad4x4x8_c(
+;void vpx_sad4x4x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vp9_sad4x4x8_sse4) PRIVATE
-sym(vp9_sad4x4x8_sse4):
+global sym(vpx_sad4x4x8_sse4_1) PRIVATE
+sym(vpx_sad4x4x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
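
For orientation: unlike the x4d kernels, the x3 kernels in the surrounding sse3/ssse3 files and the x8 kernels above sweep the reference pointer horizontally, writing the SAD at ref_ptr + i for i = 0..2 (x3, int results) or i = 0..7 (x8, unsigned short results). A reference sketch under that reading, generic over block size (the SIMD versions are specialised per size):

    #include <stdlib.h>

    static void sad_x_offsets_ref(const unsigned char *src_ptr, int src_stride,
                                  const unsigned char *ref_ptr, int ref_stride,
                                  int w, int h, int noffsets, int *results) {
      for (int i = 0; i < noffsets; ++i) {
        const unsigned char *s = src_ptr, *r = ref_ptr + i;
        int sad = 0;
        for (int y = 0; y < h; ++y) {
          for (int x = 0; x < w; ++x) sad += abs(s[x] - r[x]);
          s += src_stride;
          r += ref_stride;
        }
        results[i] = sad;
      }
    }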
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm
index 278fc0640ed..49f204fa04b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_ssse3.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm
@@ -146,14 +146,14 @@
%endmacro
-;void int vp8_sad16x16x3_ssse3(
+;void vpx_sad16x16x3_ssse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp8_sad16x16x3_ssse3) PRIVATE
-sym(vp8_sad16x16x3_ssse3):
+global sym(vpx_sad16x16x3_ssse3) PRIVATE
+sym(vpx_sad16x16x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -169,31 +169,31 @@ sym(vp8_sad16x16x3_ssse3):
mov rdx, 0xf
and rdx, rdi
- jmp .vp8_sad16x16x3_ssse3_skiptable
-.vp8_sad16x16x3_ssse3_jumptable:
- dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
-.vp8_sad16x16x3_ssse3_skiptable:
-
- call .vp8_sad16x16x3_ssse3_do_jump
-.vp8_sad16x16x3_ssse3_do_jump:
+ jmp .vpx_sad16x16x3_ssse3_skiptable
+.vpx_sad16x16x3_ssse3_jumptable:
+ dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
+.vpx_sad16x16x3_ssse3_skiptable:
+
+ call .vpx_sad16x16x3_ssse3_do_jump
+.vpx_sad16x16x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
- mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
+ mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
add rcx, rax
@@ -203,23 +203,23 @@ sym(vp8_sad16x16x3_ssse3):
jmp rcx
- PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
-
-.vp8_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
+
+.vpx_sad16x16x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
@@ -229,7 +229,7 @@ sym(vp8_sad16x16x3_ssse3):
PROCESS_16X2X3 0
PROCESS_16X2X3 0
-.vp8_sad16x16x3_ssse3_store_off:
+.vpx_sad16x16x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
@@ -259,14 +259,14 @@ sym(vp8_sad16x16x3_ssse3):
pop rbp
ret
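
The call/pop pair above is the classic position-independent-code trick: pop the return address to learn the current instruction pointer, add a 32-bit offset pulled from the jumptable, and jmp to the loop variant matching the low four bits of the reference pointer, each variant hard-wiring a different byte realignment. The C analogue is a handler table indexed by alignment (names hypothetical):

    #include <stdint.h>

    typedef void (*sad_variant_fn)(void);

    /* Mirrors "and rdx, rdi" plus the indexed jmp: slot n holds a loop
     * specialised for a reference pointer that sits n bytes past a
     * 16-byte boundary. */
    static void dispatch_by_alignment(const uint8_t *ref_ptr,
                                      const sad_variant_fn table[16]) {
      table[(uintptr_t)ref_ptr & 15]();
    }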
-;void int vp8_sad16x8x3_ssse3(
+;void vpx_sad16x8x3_ssse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp8_sad16x8x3_ssse3) PRIVATE
-sym(vp8_sad16x8x3_ssse3):
+global sym(vpx_sad16x8x3_ssse3) PRIVATE
+sym(vpx_sad16x8x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -282,31 +282,31 @@ sym(vp8_sad16x8x3_ssse3):
mov rdx, 0xf
and rdx, rdi
- jmp .vp8_sad16x8x3_ssse3_skiptable
-.vp8_sad16x8x3_ssse3_jumptable:
- dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
-.vp8_sad16x8x3_ssse3_skiptable:
-
- call .vp8_sad16x8x3_ssse3_do_jump
-.vp8_sad16x8x3_ssse3_do_jump:
+ jmp .vpx_sad16x8x3_ssse3_skiptable
+.vpx_sad16x8x3_ssse3_jumptable:
+ dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
+.vpx_sad16x8x3_ssse3_skiptable:
+
+ call .vpx_sad16x8x3_ssse3_do_jump
+.vpx_sad16x8x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
- mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
+ mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
add rcx, rax
@@ -316,30 +316,30 @@ sym(vp8_sad16x8x3_ssse3):
jmp rcx
- PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
-
-.vp8_sad16x8x3_ssse3_aligned_by_15:
+ PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
+
+.vpx_sad16x8x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
-.vp8_sad16x8x3_ssse3_store_off:
+.vpx_sad16x8x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h
index 225a3babfe3..c4dd78550f3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h
@@ -13,35 +13,6 @@
#define VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
#include "./vpx_config.h"
-#ifndef CONFIG_MEM_MANAGER
-# if defined(VXWORKS)
-# define CONFIG_MEM_MANAGER 1 /*include heap manager functionality,*/
-/*default: enabled on vxworks*/
-# else
-# define CONFIG_MEM_MANAGER 0 /*include heap manager functionality*/
-# endif
-#endif /*CONFIG_MEM_MANAGER*/
-
-#ifndef CONFIG_MEM_TRACKER
-# define CONFIG_MEM_TRACKER 1 /*include xvpx_* calls in the lib*/
-#endif
-
-#ifndef CONFIG_MEM_CHECKS
-# define CONFIG_MEM_CHECKS 0 /*include some basic safety checks in
-vpx_memcpy, _memset, and _memmove*/
-#endif
-
-#ifndef USE_GLOBAL_FUNCTION_POINTERS
-# define USE_GLOBAL_FUNCTION_POINTERS 0 /*use function pointers instead of compiled functions.*/
-#endif
-
-#if CONFIG_MEM_TRACKER
-# include "vpx_mem_tracker.h"
-# if VPX_MEM_TRACKER_VERSION_CHIEF != 2 || VPX_MEM_TRACKER_VERSION_MAJOR != 5
-# error "vpx_mem requires memory tracker version 2.5 to track memory usage"
-# endif
-#endif
-
#define ADDRESS_STORAGE_SIZE sizeof(size_t)
#ifndef DEFAULT_ALIGNMENT
@@ -54,41 +25,6 @@ than vpx_memalign*/
# endif
#endif
-#if CONFIG_MEM_TRACKER
-# define TRY_BOUNDS_CHECK 1 /*when set to 1 pads each allocation,
-integrity can be checked using
-vpx_memory_tracker_check_integrity
-or on free by defining*/
-/*TRY_BOUNDS_CHECK_ON_FREE*/
-#else
-# define TRY_BOUNDS_CHECK 0
-#endif /*CONFIG_MEM_TRACKER*/
-
-#if TRY_BOUNDS_CHECK
-# define TRY_BOUNDS_CHECK_ON_FREE 0 /*checks mem integrity on every
-free, very expensive*/
-# define BOUNDS_CHECK_VALUE 0xdeadbeef /*value stored before/after ea.
-mem addr for bounds checking*/
-# define BOUNDS_CHECK_PAD_SIZE 32 /*size of the padding before and
-after ea allocation to be filled
-with BOUNDS_CHECK_VALUE.
-this should be a multiple of 4*/
-#else
-# define BOUNDS_CHECK_VALUE 0
-# define BOUNDS_CHECK_PAD_SIZE 0
-#endif /*TRY_BOUNDS_CHECK*/
-
-#ifndef REMOVE_PRINTFS
-# define REMOVE_PRINTFS 0
-#endif
-
-/* Should probably use a vpx_mem logger function. */
-#if REMOVE_PRINTFS
-# define _P(x)
-#else
-# define _P(x) x
-#endif
-
/*returns an addr aligned to the byte boundary specified by align*/
#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
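
The surviving align_addr macro rounds an address up to a power-of-two boundary: add align - 1, then mask with -align, whose two's-complement form has all low bits clear. A self-contained check (assumes align is a power of two):

    #include <assert.h>
    #include <stddef.h>

    #define align_addr(addr, align) \
      (void *)(((size_t)(addr) + ((align)-1)) & (size_t)-(align))

    int main(void) {
      /* 0x1003 rounds up to the next 16-byte boundary; an already
       * aligned input is returned unchanged. */
      assert(align_addr((void *)0x1003, 16) == (void *)0x1010);
      assert(align_addr((void *)0x1010, 16) == (void *)0x1010);
      return 0;
    }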
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_tracker.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_tracker.h
deleted file mode 100644
index 1335e0017b3..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_tracker.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_
-#define VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_
-
-/* vpx_mem_tracker version info */
-#define vpx_mem_tracker_version "2.5.1.1"
-
-#define VPX_MEM_TRACKER_VERSION_CHIEF 2
-#define VPX_MEM_TRACKER_VERSION_MAJOR 5
-#define VPX_MEM_TRACKER_VERSION_MINOR 1
-#define VPX_MEM_TRACKER_VERSION_PATCH 1
-/* END - vpx_mem_tracker version info */
-
-#include <stdarg.h>
-
-struct mem_block {
- size_t addr;
- unsigned int size,
- line;
- char *file;
- struct mem_block *prev,
- * next;
-
- int padded; // This mem_block has padding for integrity checks.
- // As of right now, this should only be 0 if
- // using vpx_mem_alloc to allocate cache memory.
- // 2005-01-11 tjf
-};
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
- /*
- vpx_memory_tracker_init(int padding_size, int pad_value)
- padding_size - the size of the padding before and after each mem addr.
- Values > 0 indicate that integrity checks can be performed
- by inspecting these areas.
- pad_value - the initial value within the padding area before and after
- each mem addr.
-
- Initializes the memory tracker interface. Should be called before any
- other calls to the memory tracker.
- */
- int vpx_memory_tracker_init(int padding_size, int pad_value);
-
- /*
- vpx_memory_tracker_destroy()
- Deinitializes the memory tracker interface
- */
- void vpx_memory_tracker_destroy();
-
- /*
- vpx_memory_tracker_add(size_t addr, unsigned int size,
- char * file, unsigned int line)
- addr - memory address to be added to list
- size - size of addr
- file - the file addr was referenced from
- line - the line in file addr was referenced from
- Adds memory address addr, it's size, file and line it came from
- to the memory tracker allocation table
- */
- void vpx_memory_tracker_add(size_t addr, unsigned int size,
- char *file, unsigned int line,
- int padded);
-
- /*
- vpx_memory_tracker_add(size_t addr, unsigned int size, char * file, unsigned int line)
- addr - memory address to be added to be removed
- padded - if 0, disables bounds checking on this memory block even if bounds
- checking is enabled. (for example, when allocating cache memory, we still want
- to check for memory leaks, but we do not waste cache space for bounds check padding)
- Removes the specified address from the memory tracker's allocation
- table
- Return:
- 0: on success
- -1: if memory allocation table's mutex could not be locked
- -2: if the addr was not found in the list
- */
- int vpx_memory_tracker_remove(size_t addr);
-
- /*
- vpx_memory_tracker_find(unsigned int addr)
- addr - address to be found in the memory tracker's
- allocation table
- Return:
- If found, pointer to the memory block that matches addr
- NULL otherwise
- */
- struct mem_block *vpx_memory_tracker_find(size_t addr);
-
- /*
- vpx_memory_tracker_dump()
- Dumps the current contents of the memory
- tracker allocation table
- */
- void vpx_memory_tracker_dump();
-
- /*
- vpx_memory_tracker_check_integrity()
- If a padding_size was provided to vpx_memory_tracker_init()
- This function will verify that the region before and after each
- memory address contains the specified pad_value. Should the check
- fail, the filename and line of the check will be printed out.
- */
- void vpx_memory_tracker_check_integrity(char *file, unsigned int line);
-
- /*
- vpx_memory_tracker_set_log_type
- type - value representing the logging type to use
- option - type specific option. This will be interpreted differently
- based on the type.
- Sets the logging type for the memory tracker.
- Values currently supported:
- 0: if option is NULL, log to stderr, otherwise interpret option as a
- filename and attempt to open it.
- 1: Use output_debug_string (WIN32 only), option ignored
- Return:
- 0: on success
- -1: if the logging type could not be set, because the value was invalid
- or because a file could not be opened
- */
- int vpx_memory_tracker_set_log_type(int type, char *option);
-
- /*
- vpx_memory_tracker_set_log_func
- userdata - ptr to be passed to the supplied logfunc, can be NULL
- logfunc - the logging function to be used to output data from
- vpx_memory_track_dump/check_integrity
- Sets a logging function to be used by the memory tracker.
- Return:
- 0: on success
- -1: if the logging type could not be set because logfunc was NULL
- */
- int vpx_memory_tracker_set_log_func(void *userdata,
- void(*logfunc)(void *userdata,
- const char *fmt, va_list args));
-
- /* Wrappers to standard library functions. */
- typedef void *(* mem_track_malloc_func)(size_t);
- typedef void *(* mem_track_calloc_func)(size_t, size_t);
- typedef void *(* mem_track_realloc_func)(void *, size_t);
- typedef void (* mem_track_free_func)(void *);
- typedef void *(* mem_track_memcpy_func)(void *, const void *, size_t);
- typedef void *(* mem_track_memset_func)(void *, int, size_t);
- typedef void *(* mem_track_memmove_func)(void *, const void *, size_t);
-
- /*
- vpx_memory_tracker_set_functions
-
- Sets the function pointers for the standard library functions.
-
- Return:
- 0: on success
- -1: if the use global function pointers is not set.
- */
- int vpx_memory_tracker_set_functions(mem_track_malloc_func g_malloc_l
-, mem_track_calloc_func g_calloc_l
-, mem_track_realloc_func g_realloc_l
-, mem_track_free_func g_free_l
-, mem_track_memcpy_func g_memcpy_l
-, mem_track_memset_func g_memset_l
-, mem_track_memmove_func g_memmove_l);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif // VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_alloc.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_alloc.c
deleted file mode 100644
index ab3562dfb3d..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_alloc.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void *U(alloc)(U(descriptor) *desc, U(size_aau) n) {
-#ifdef HMM_AUDIT_FAIL
-
- if (desc->avl_tree_root)
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
- if (desc->last_freed) {
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(desc->last_freed)
-#endif
-
- U(into_free_collection)(desc, (head_record *)(desc->last_freed));
-
- desc->last_freed = 0;
- }
-
- /* Add space for block header. */
- n += HEAD_AAUS;
-
- /* Convert n from number of address alignment units to block alignment
- ** units. */
- n = DIV_ROUND_UP(n, HMM_BLOCK_ALIGN_UNIT);
-
- if (n < MIN_BLOCK_BAUS)
- n = MIN_BLOCK_BAUS;
-
- {
- /* Search for the first node of the bin containing the smallest
- ** block big enough to satisfy request. */
- ptr_record *ptr_rec_ptr =
- U(avl_search)(
- (U(avl_avl) *) & (desc->avl_tree_root), (U(size_bau)) n,
- AVL_GREATER_EQUAL);
-
- /* If an approprate bin is found, satisfy the allocation request,
- ** otherwise return null pointer. */
- return(ptr_rec_ptr ?
- U(alloc_from_bin)(desc, ptr_rec_ptr, (U(size_bau)) n) : 0);
- }
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_base.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_base.c
deleted file mode 100644
index 0eff59d20e9..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_base.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void U(init)(U(descriptor) *desc) {
- desc->avl_tree_root = 0;
- desc->last_freed = 0;
-}
-
-/* Remove a free block from a bin's doubly-linked list when it is not,
-** the first block in the bin.
-*/
-void U(dll_remove)(
- /* Pointer to pointer record in the block to be removed. */
- ptr_record *to_remove) {
- to_remove->prev->next = to_remove->next;
-
- if (to_remove->next)
- to_remove->next->prev = to_remove->prev;
-}
-
-/* Put a block into the free collection of a heap.
-*/
-void U(into_free_collection)(
- /* Pointer to heap descriptor. */
- U(descriptor) *desc,
- /* Pointer to head record of block. */
- head_record *head_ptr) {
- ptr_record *ptr_rec_ptr = HEAD_TO_PTR_REC(head_ptr);
-
- ptr_record *bin_front_ptr =
- U(avl_insert)((U(avl_avl) *) & (desc->avl_tree_root), ptr_rec_ptr);
-
- if (bin_front_ptr != ptr_rec_ptr) {
- /* The block was not inserted into the AVL tree because there is
- ** already a bin for the size of the block. */
-
- MARK_SUCCESSIVE_BLOCK_IN_FREE_BIN(head_ptr)
- ptr_rec_ptr->self = ptr_rec_ptr;
-
- /* Make the block the new second block in the bin's doubly-linked
- ** list. */
- ptr_rec_ptr->prev = bin_front_ptr;
- ptr_rec_ptr->next = bin_front_ptr->next;
- bin_front_ptr->next = ptr_rec_ptr;
-
- if (ptr_rec_ptr->next)
- ptr_rec_ptr->next->prev = ptr_rec_ptr;
- } else
- /* Block is first block in new bin. */
- ptr_rec_ptr->next = 0;
-}
-
-/* Allocate a block from a given bin. Returns a pointer to the payload
-** of the removed block. The "last freed" pointer must be null prior
-** to calling this function.
-*/
-void *U(alloc_from_bin)(
- /* Pointer to heap descriptor. */
- U(descriptor) *desc,
- /* Pointer to pointer record of first block in bin. */
- ptr_record *bin_front_ptr,
- /* Number of BAUs needed in the allocated block. If the block taken
- ** from the bin is significantly larger than the number of BAUs needed,
- ** the "extra" BAUs are split off to form a new free block. */
- U(size_bau) n_baus) {
- head_record *head_ptr;
- U(size_bau) rem_baus;
-
- if (bin_front_ptr->next) {
- /* There are multiple blocks in this bin. Use the 2nd block in
- ** the bin to avoid needless change to the AVL tree.
- */
-
- ptr_record *ptr_rec_ptr = bin_front_ptr->next;
- head_ptr = PTR_REC_TO_HEAD(ptr_rec_ptr);
-
-#ifdef AUDIT_FAIL
- AUDIT_BLOCK(head_ptr)
-#endif
-
- U(dll_remove)(ptr_rec_ptr);
- } else {
- /* There is only one block in the bin, so it has to be removed
- ** from the AVL tree.
- */
-
- head_ptr = PTR_REC_TO_HEAD(bin_front_ptr);
-
- U(avl_remove)(
- (U(avl_avl) *) & (desc->avl_tree_root), BLOCK_BAUS(head_ptr));
- }
-
- MARK_BLOCK_ALLOCATED(head_ptr)
-
- rem_baus = BLOCK_BAUS(head_ptr) - n_baus;
-
- if (rem_baus >= MIN_BLOCK_BAUS) {
- /* Since there are enough "extra" BAUs, split them off to form
- ** a new free block.
- */
-
- head_record *rem_head_ptr =
- (head_record *) BAUS_FORWARD(head_ptr, n_baus);
-
- /* Change the next block's header to reflect the fact that the
- ** block preceeding it is now smaller.
- */
- SET_PREV_BLOCK_BAUS(
- BAUS_FORWARD(head_ptr, head_ptr->block_size), rem_baus)
-
- head_ptr->block_size = n_baus;
-
- rem_head_ptr->previous_block_size = n_baus;
- rem_head_ptr->block_size = rem_baus;
-
- desc->last_freed = rem_head_ptr;
- }
-
- return(HEAD_TO_PTR_REC(head_ptr));
-}
-
-/* Take a block out of the free collection.
-*/
-void U(out_of_free_collection)(
- /* Descriptor of heap that block is in. */
- U(descriptor) *desc,
- /* Pointer to head of block to take out of free collection. */
- head_record *head_ptr) {
- ptr_record *ptr_rec_ptr = HEAD_TO_PTR_REC(head_ptr);
-
- if (ptr_rec_ptr->self == ptr_rec_ptr)
- /* Block is not the front block in its bin, so all we have to
- ** do is take it out of the bin's doubly-linked list. */
- U(dll_remove)(ptr_rec_ptr);
- else {
- ptr_record *next = ptr_rec_ptr->next;
-
- if (next)
- /* Block is the front block in its bin, and there is at least
- ** one other block in the bin. Substitute the next block for
- ** the front block. */
- U(avl_subst)((U(avl_avl) *) & (desc->avl_tree_root), next);
- else
- /* Block is the front block in its bin, but there is no other
- ** block in the bin. Eliminate the bin. */
- U(avl_remove)(
- (U(avl_avl) *) & (desc->avl_tree_root), BLOCK_BAUS(head_ptr));
- }
-}
-
-void U(free)(U(descriptor) *desc, void *payload_ptr) {
- /* Flags if coalesce with adjacent block. */
- int coalesce;
-
- head_record *fwd_head_ptr;
- head_record *free_head_ptr = PTR_REC_TO_HEAD(payload_ptr);
-
- desc->num_baus_can_shrink = 0;
-
-#ifdef HMM_AUDIT_FAIL
-
- AUDIT_BLOCK(free_head_ptr)
-
- /* Make sure not freeing an already free block. */
- if (!IS_BLOCK_ALLOCATED(free_head_ptr))
- HMM_AUDIT_FAIL
-
- if (desc->avl_tree_root)
- /* Audit root block in AVL tree. */
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-
-#endif
-
- fwd_head_ptr =
- (head_record *) BAUS_FORWARD(free_head_ptr, free_head_ptr->block_size);
-
- if (free_head_ptr->previous_block_size) {
- /* Coalesce with backward block if possible. */
-
- head_record *bkwd_head_ptr =
- (head_record *) BAUS_BACKWARD(
- free_head_ptr, free_head_ptr->previous_block_size);
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(bkwd_head_ptr)
-#endif
-
- if (bkwd_head_ptr == (head_record *)(desc->last_freed)) {
- desc->last_freed = 0;
- coalesce = 1;
- } else if (IS_BLOCK_ALLOCATED(bkwd_head_ptr))
- coalesce = 0;
- else {
- U(out_of_free_collection)(desc, bkwd_head_ptr);
- coalesce = 1;
- }
-
- if (coalesce) {
- bkwd_head_ptr->block_size += free_head_ptr->block_size;
- SET_PREV_BLOCK_BAUS(fwd_head_ptr, BLOCK_BAUS(bkwd_head_ptr))
- free_head_ptr = bkwd_head_ptr;
- }
- }
-
- if (fwd_head_ptr->block_size == 0) {
- /* Block to be freed is last block before dummy end-of-chunk block. */
- desc->end_of_shrinkable_chunk =
- BAUS_FORWARD(fwd_head_ptr, DUMMY_END_BLOCK_BAUS);
- desc->num_baus_can_shrink = BLOCK_BAUS(free_head_ptr);
-
- if (PREV_BLOCK_BAUS(free_head_ptr) == 0)
- /* Free block is the entire chunk, so shrinking can eliminate
- ** entire chunk including dummy end block. */
- desc->num_baus_can_shrink += DUMMY_END_BLOCK_BAUS;
- } else {
- /* Coalesce with forward block if possible. */
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(fwd_head_ptr)
-#endif
-
- if (fwd_head_ptr == (head_record *)(desc->last_freed)) {
- desc->last_freed = 0;
- coalesce = 1;
- } else if (IS_BLOCK_ALLOCATED(fwd_head_ptr))
- coalesce = 0;
- else {
- U(out_of_free_collection)(desc, fwd_head_ptr);
- coalesce = 1;
- }
-
- if (coalesce) {
- free_head_ptr->block_size += fwd_head_ptr->block_size;
-
- fwd_head_ptr =
- (head_record *) BAUS_FORWARD(
- fwd_head_ptr, BLOCK_BAUS(fwd_head_ptr));
-
- SET_PREV_BLOCK_BAUS(fwd_head_ptr, BLOCK_BAUS(free_head_ptr))
-
- if (fwd_head_ptr->block_size == 0) {
- /* Coalesced block to be freed is last block before dummy
- ** end-of-chunk block. */
- desc->end_of_shrinkable_chunk =
- BAUS_FORWARD(fwd_head_ptr, DUMMY_END_BLOCK_BAUS);
- desc->num_baus_can_shrink = BLOCK_BAUS(free_head_ptr);
-
- if (PREV_BLOCK_BAUS(free_head_ptr) == 0)
- /* Free block is the entire chunk, so shrinking can
- ** eliminate entire chunk including dummy end block. */
- desc->num_baus_can_shrink += DUMMY_END_BLOCK_BAUS;
- }
- }
- }
-
- if (desc->last_freed) {
- /* There is a last freed block, but it is not adjacent to the
- ** block being freed by this call to free, so put the last
- ** freed block into the free collection.
- */
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(desc->last_freed)
-#endif
-
- U(into_free_collection)(desc, (head_record *)(desc->last_freed));
- }
-
- desc->last_freed = free_head_ptr;
-}
-
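
Though this allocator is being deleted, U(free) above is textbook boundary-tag coalescing: each header stores its own size and its predecessor's, so both neighbours are reachable in constant time for merging. A sketch of the pointer arithmetic, with the field widths and BAU_BYTES chosen for illustration (the real code parameterises these via U() and HMM_BLOCK_ALIGN_UNIT):

    typedef struct {
      unsigned short previous_block_size; /* BAUs; 0 => first in chunk */
      unsigned short block_size;          /* BAUs; 0 => dummy end block */
    } head_rec;

    #define BAU_BYTES 8 /* stand-in for the block alignment unit size */

    static head_rec *fwd_block(head_rec *h) {  /* cf. BAUS_FORWARD */
      return (head_rec *)((char *)h + h->block_size * BAU_BYTES);
    }
    static head_rec *bkwd_block(head_rec *h) { /* cf. BAUS_BACKWARD */
      return (head_rec *)((char *)h - h->previous_block_size * BAU_BYTES);
    }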
-void U(new_chunk)(U(descriptor) *desc, void *start, U(size_bau) n_baus) {
-#ifdef HMM_AUDIT_FAIL
-
- if (desc->avl_tree_root)
- /* Audit root block in AVL tree. */
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
-#undef HEAD_PTR
-#define HEAD_PTR ((head_record *) start)
-
- /* Make the chunk one big free block followed by a dummy end block.
- */
-
- n_baus -= DUMMY_END_BLOCK_BAUS;
-
- HEAD_PTR->previous_block_size = 0;
- HEAD_PTR->block_size = n_baus;
-
- U(into_free_collection)(desc, HEAD_PTR);
-
- /* Set up the dummy end block. */
- start = BAUS_FORWARD(start, n_baus);
- HEAD_PTR->previous_block_size = n_baus;
- HEAD_PTR->block_size = 0;
-
-#undef HEAD_PTR
-}
-
-#ifdef HMM_AUDIT_FAIL
-
-/* Function that does audit fail actions defined my preprocessor symbol,
-** and returns a dummy integer value.
-*/
-int U(audit_block_fail_dummy_return)(void) {
- HMM_AUDIT_FAIL
-
- /* Dummy return. */
- return(0);
-}
-
-#endif
-
-/* AVL Tree instantiation. */
-
-#ifdef HMM_AUDIT_FAIL
-
-/* The AVL tree generic package passes an ACCESS of 1 when it "touches"
-** a child node for the first time during a particular operation. I use
-** this feature to audit only one time (per operation) the free blocks
-** that are tree nodes. Since the root node is not a child node, it has
-** to be audited directly.
-*/
-
-/* The pain you feel while reading these macros will not be in vain. It
-** will remove all doubt from you mind that C++ inline functions are
-** a very good thing.
-*/
-
-#define AVL_GET_LESS(H, ACCESS) \
- (((ACCESS) ? AUDIT_BLOCK_AS_EXPR(PTR_REC_TO_HEAD(H)) : 0), (H)->self)
-#define AVL_GET_GREATER(H, ACCESS) \
- (((ACCESS) ? AUDIT_BLOCK_AS_EXPR(PTR_REC_TO_HEAD(H)) : 0), (H)->prev)
-
-#else
-
-#define AVL_GET_LESS(H, ACCESS) ((H)->self)
-#define AVL_GET_GREATER(H, ACCESS) ((H)->prev)
-
-#endif
-
-#define AVL_SET_LESS(H, LH) (H)->self = (LH);
-#define AVL_SET_GREATER(H, GH) (H)->prev = (GH);
-
-/* high bit of high bit of
-** block_size previous_block_size balance factor
-** ----------- ------------------- --------------
-** 0 0 n/a (block allocated)
-** 0 1 1
-** 1 0 -1
-** 1 1 0
-*/
-
-#define AVL_GET_BALANCE_FACTOR(H) \
- ((((head_record *) (PTR_REC_TO_HEAD(H)))->block_size & \
- HIGH_BIT_BAU_SIZE) ? \
- (((head_record *) (PTR_REC_TO_HEAD(H)))->previous_block_size & \
- HIGH_BIT_BAU_SIZE ? 0 : -1) : 1)
-
-#define AVL_SET_BALANCE_FACTOR(H, BF) \
- { \
- register head_record *p = \
- (head_record *) PTR_REC_TO_HEAD(H); \
- register int bal_f = (BF); \
- \
- if (bal_f <= 0) \
- p->block_size |= HIGH_BIT_BAU_SIZE; \
- else \
- p->block_size &= ~HIGH_BIT_BAU_SIZE; \
- if (bal_f >= 0) \
- p->previous_block_size |= HIGH_BIT_BAU_SIZE; \
- else \
- p->previous_block_size &= ~HIGH_BIT_BAU_SIZE; \
- }
-
-#define COMPARE_KEY_KEY(K1, K2) ((K1) == (K2) ? 0 : ((K1) > (K2) ? 1 : -1))
-
-#define AVL_COMPARE_KEY_NODE(K, H) \
- COMPARE_KEY_KEY(K, BLOCK_BAUS(PTR_REC_TO_HEAD(H)))
-
-#define AVL_COMPARE_NODE_NODE(H1, H2) \
- COMPARE_KEY_KEY(BLOCK_BAUS(PTR_REC_TO_HEAD(H1)), \
- BLOCK_BAUS(PTR_REC_TO_HEAD(H2)))
-
-#define AVL_NULL ((ptr_record *) 0)
-
-#define AVL_IMPL_MASK \
- ( AVL_IMPL_INSERT | AVL_IMPL_SEARCH | AVL_IMPL_REMOVE | AVL_IMPL_SUBST )
-
-#include "cavl_impl.h"
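
The balance-factor table above packs an AVL node's -1/0/+1 state into the otherwise unused high bits of the two size fields of a free block, with (0,0) reserved to mark allocated blocks. Decoded in C (16-bit fields assumed for the sketch):

    #include <stdint.h>

    #define HIGH_BIT 0x8000u

    static int balance_factor(uint16_t block_size, uint16_t prev_size) {
      if (!(block_size & HIGH_BIT)) return 1;   /* (0,1) => +1 */
      return (prev_size & HIGH_BIT) ? 0 : -1;   /* (1,1) => 0, (1,0) => -1 */
    }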
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c
deleted file mode 100644
index 51c3cc27a4a..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-/* The function in this file performs default actions if self-auditing
-** finds heap corruption. Don't rely on this code to handle the
-** case where HMM is being used to implement the malloc and free standard
-** library functions. Rewrite the function if necessary to avoid using
-** I/O and execution termination functions that call malloc or free.
-** In Unix, for example, you would replace the fputs calls with calls
-** to the write system call using file handle number 2.
-*/
-#include "hmm_intrnl.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-static int entered = 0;
-
-/* Print abort message, file and line. Terminate execution.
-*/
-void hmm_dflt_abort(const char *file, const char *line) {
- /* Avoid use of printf(), which is more likely to use heap. */
-
- if (entered)
-
- /* The standard I/O functions called a heap function and caused
- ** an indirect recursive call to this function. So we'll have
- ** to just exit without printing a message. */
- while (1);
-
- entered = 1;
-
- fputs("\n_abort - Heap corruption\n" "File: ", stderr);
- fputs(file, stderr);
- fputs(" Line: ", stderr);
- fputs(line, stderr);
- fputs("\n\n", stderr);
- fputs("hmm_dflt_abort: while(1)!!!\n", stderr);
- fflush(stderr);
-
- while (1);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_grow.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_grow.c
deleted file mode 100644
index 0e86373748a..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_grow.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void U(grow_chunk)(U(descriptor) *desc, void *end, U(size_bau) n_baus) {
-#undef HEAD_PTR
-#define HEAD_PTR ((head_record *) end)
-
- end = BAUS_BACKWARD(end, DUMMY_END_BLOCK_BAUS);
-
-#ifdef HMM_AUDIT_FAIL
-
- if (HEAD_PTR->block_size != 0)
- /* Chunk does not have valid dummy end block. */
- HMM_AUDIT_FAIL
-
-#endif
-
- /* Create a new block that absorbs the old dummy end block. */
- HEAD_PTR->block_size = n_baus;
-
- /* Set up the new dummy end block. */
- {
- head_record *dummy = (head_record *) BAUS_FORWARD(end, n_baus);
- dummy->previous_block_size = n_baus;
- dummy->block_size = 0;
- }
-
- /* Simply free the new block, allowing it to coalesce with any
- ** free block at that was the last block in the chunk prior to
- ** growth.
- */
- U(free)(desc, HEAD_TO_PTR_REC(end));
-
-#undef HEAD_PTR
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_largest.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_largest.c
deleted file mode 100644
index 192758df909..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_largest.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-U(size_aau) U(largest_available)(U(descriptor) *desc) {
- U(size_bau) largest;
-
- if (!(desc->avl_tree_root))
- largest = 0;
- else {
-#ifdef HMM_AUDIT_FAIL
- /* Audit root block in AVL tree. */
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
- largest =
- BLOCK_BAUS(
- PTR_REC_TO_HEAD(
- U(avl_search)(
- (U(avl_avl) *) & (desc->avl_tree_root),
- (U(size_bau)) ~(U(size_bau)) 0, AVL_LESS)));
- }
-
- if (desc->last_freed) {
- /* Size of last freed block. */
- register U(size_bau) lf_size;
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(desc->last_freed)
-#endif
-
- lf_size = BLOCK_BAUS(desc->last_freed);
-
- if (lf_size > largest)
- largest = lf_size;
- }
-
- /* Convert largest size to AAUs and subract head size leaving payload
- ** size.
- */
- return(largest ?
- ((largest * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT)) - HEAD_AAUS) :
- 0);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_resize.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_resize.c
deleted file mode 100644
index baa5a8f9eda..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_resize.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-int U(resize)(U(descriptor) *desc, void *mem, U(size_aau) n) {
- U(size_aau) i;
- head_record *next_head_ptr;
- head_record *head_ptr = PTR_REC_TO_HEAD(mem);
-
- /* Flag. */
- int next_block_free;
-
- /* Convert n from desired block size in AAUs to BAUs. */
- n += HEAD_AAUS;
- n = DIV_ROUND_UP(n, HMM_BLOCK_ALIGN_UNIT);
-
- if (n < MIN_BLOCK_BAUS)
- n = MIN_BLOCK_BAUS;
-
-#ifdef HMM_AUDIT_FAIL
-
- AUDIT_BLOCK(head_ptr)
-
- if (!IS_BLOCK_ALLOCATED(head_ptr))
- HMM_AUDIT_FAIL
-
- if (desc->avl_tree_root)
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-
-#endif
-
- i = head_ptr->block_size;
-
- next_head_ptr =
- (head_record *) BAUS_FORWARD(head_ptr, head_ptr->block_size);
-
- next_block_free =
- (next_head_ptr == desc->last_freed) ||
- !IS_BLOCK_ALLOCATED(next_head_ptr);
-
- if (next_block_free)
- /* Block can expand into next free block. */
- i += BLOCK_BAUS(next_head_ptr);
-
- if (n > i)
- /* Not enough room for block to expand. */
- return(-1);
-
- if (next_block_free) {
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(next_head_ptr)
-#endif
-
- if (next_head_ptr == desc->last_freed)
- desc->last_freed = 0;
- else
- U(out_of_free_collection)(desc, next_head_ptr);
-
- next_head_ptr =
- (head_record *) BAUS_FORWARD(head_ptr, (U(size_bau)) i);
- }
-
- /* Set i to number of "extra" BAUs. */
- i -= n;
-
- if (i < MIN_BLOCK_BAUS)
- /* Not enough extra BAUs to be a block on their own, so just keep them
- ** in the block being resized.
- */
- {
- n += i;
- i = n;
- } else {
- /* There are enough "leftover" BAUs in the next block to
- ** form a remainder block. */
-
- head_record *rem_head_ptr;
-
- rem_head_ptr = (head_record *) BAUS_FORWARD(head_ptr, n);
-
- rem_head_ptr->previous_block_size = (U(size_bau)) n;
- rem_head_ptr->block_size = (U(size_bau)) i;
-
- if (desc->last_freed) {
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(desc->last_freed)
-#endif
-
- U(into_free_collection)(desc, (head_record *)(desc->last_freed));
-
- desc->last_freed = 0;
- }
-
- desc->last_freed = rem_head_ptr;
- }
-
- head_ptr->block_size = (U(size_bau)) n;
- next_head_ptr->previous_block_size = (U(size_bau)) i;
-
- return(0);
-}
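A worked sketch of the split decision above, with made-up sizes: once the adjacent free block has been absorbed, the leftover BAUs either stay in the resized block or become a remainder block (which resize() records as last_freed). MIN_BLOCK_BAUS is assumed to be 2 here:

/* Sketch with illustrative numbers; MIN_BLOCK_BAUS is assumed. */
#include <stdio.h>

#define MIN_BLOCK_BAUS 2UL

int main(void) {
  unsigned long i = 10; /* BAUs available: block plus adjacent free block */
  unsigned long n = 7;  /* BAUs requested, after rounding up */
  unsigned long extra = i - n;

  if (extra < MIN_BLOCK_BAUS) {
    n += extra; /* too small to stand alone: absorb into the block */
    printf("absorbed: block keeps %lu BAUs\n", n);
  } else {
    /* big enough: carve a remainder block of 'extra' BAUs at offset n */
    printf("split: block %lu BAUs, remainder %lu BAUs\n", n, extra);
  }
  return 0;
}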
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_shrink.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_shrink.c
deleted file mode 100644
index f80aeead7a0..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_shrink.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void U(shrink_chunk)(U(descriptor) *desc, U(size_bau) n_baus_to_shrink) {
- head_record *dummy_end_block = (head_record *)
- BAUS_BACKWARD(desc->end_of_shrinkable_chunk, DUMMY_END_BLOCK_BAUS);
-
-#ifdef HMM_AUDIT_FAIL
-
- if (dummy_end_block->block_size != 0)
- /* Chunk does not have valid dummy end block. */
- HMM_AUDIT_FAIL
-
-#endif
-
- if (n_baus_to_shrink) {
- head_record *last_block = (head_record *)
- BAUS_BACKWARD(
- dummy_end_block, dummy_end_block->previous_block_size);
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(last_block)
-#endif
-
- if (last_block == desc->last_freed) {
- U(size_bau) bs = BLOCK_BAUS(last_block);
-
- /* Chunk will not be shrunk out of existence if
- ** 1. There is at least one allocated block in the chunk
- ** and the amount to shrink is exactly the size of the
- ** last block, OR
- ** 2. After the last block is shrunk, there will be enough
- ** BAUs left in it to form a minimal size block. */
- int chunk_will_survive =
- (PREV_BLOCK_BAUS(last_block) && (n_baus_to_shrink == bs)) ||
- (n_baus_to_shrink <= (U(size_bau))(bs - MIN_BLOCK_BAUS));
-
- if (chunk_will_survive ||
- (!PREV_BLOCK_BAUS(last_block) &&
- (n_baus_to_shrink ==
- (U(size_bau))(bs + DUMMY_END_BLOCK_BAUS)))) {
- desc->last_freed = 0;
-
- if (chunk_will_survive) {
- bs -= n_baus_to_shrink;
-
- if (bs) {
- /* The last (non-dummy) block was not completely
- ** eliminated by the shrink. */
-
- last_block->block_size = bs;
-
- /* Create new dummy end record.
- */
- dummy_end_block =
- (head_record *) BAUS_FORWARD(last_block, bs);
- dummy_end_block->previous_block_size = bs;
- dummy_end_block->block_size = 0;
-
-#ifdef HMM_AUDIT_FAIL
-
- if (desc->avl_tree_root)
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
- U(into_free_collection)(desc, last_block);
- } else {
- /* The last (non-dummy) block was completely
- ** eliminated by the shrink. Make its head
- ** the new dummy end block.
- */
- last_block->block_size = 0;
- last_block->previous_block_size &= ~HIGH_BIT_BAU_SIZE;
- }
- }
- }
-
-#ifdef HMM_AUDIT_FAIL
- else
- HMM_AUDIT_FAIL
-#endif
- }
-
-#ifdef HMM_AUDIT_FAIL
- else
- HMM_AUDIT_FAIL
-#endif
- }
-}
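The chunk-survival conditions above are easier to see with numbers. A sketch, assuming MIN_BLOCK_BAUS of 2 and DUMMY_END_BLOCK_BAUS of 1 (both are configuration-dependent):

#include <stdio.h>

#define MIN_BLOCK_BAUS 2UL       /* assumed */
#define DUMMY_END_BLOCK_BAUS 1UL /* assumed */

int main(void) {
  unsigned long bs = 8;        /* BAUs in the last (free) block */
  unsigned long prev_baus = 4; /* nonzero: another block precedes it */
  unsigned long shrink = 8;    /* BAUs to shrink */

  int survives = (prev_baus && shrink == bs) ||
                 (shrink <= bs - MIN_BLOCK_BAUS);
  int vanishes = !prev_baus && shrink == bs + DUMMY_END_BLOCK_BAUS;

  printf("survives=%d vanishes=%d\n", survives, vanishes); /* prints 1 0 */
  return 0;
}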
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_true.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_true.c
deleted file mode 100644
index 4428c3e34a2..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_true.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-U(size_aau) U(true_size)(void *payload_ptr) {
- register head_record *head_ptr = PTR_REC_TO_HEAD(payload_ptr);
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(head_ptr)
-#endif
-
- /* Convert block size from BAUs to AAUs. Subtract head size, leaving
- ** payload size.
- */
- return(
- (BLOCK_BAUS(head_ptr) * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT)) -
- HEAD_AAUS);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_if.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_if.h
deleted file mode 100644
index a5ced8bb7b1..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_if.h
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IF_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IF_H_
-
-/* Abstract AVL Tree Generic C Package.
-** Interface generation header file.
-**
-** This code is in the public domain. See cavl_tree.html for interface
-** documentation.
-**
-** Version: 1.5 Author: Walt Karas
-*/
-
-/* This header contains the definition of CHAR_BIT (number of bits in a
-** char). */
-#include <limits.h>
-
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef L_SC
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-
-#ifndef AVL_SEARCH_TYPE_DEFINED_
-#define AVL_SEARCH_TYPE_DEFINED_
-
-typedef enum {
- AVL_EQUAL = 1,
- AVL_LESS = 2,
- AVL_GREATER = 4,
- AVL_LESS_EQUAL = AVL_EQUAL | AVL_LESS,
- AVL_GREATER_EQUAL = AVL_EQUAL | AVL_GREATER
-}
-avl_search_type;
-
-#endif
-
-#ifdef AVL_UNIQUE
-
-#define L_ AVL_UNIQUE
-
-#else
-
-#define L_(X) X
-
-#endif
-
-/* Determine storage class for function prototypes. */
-#ifdef AVL_PRIVATE
-
-#define L_SC static
-
-#else
-
-#define L_SC extern
-
-#endif
-
-#ifdef AVL_SIZE
-
-#define L_SIZE AVL_SIZE
-
-#else
-
-#define L_SIZE unsigned long
-
-#endif
-
-typedef struct {
-#ifdef AVL_INSIDE_STRUCT
-
- AVL_INSIDE_STRUCT
-
-#endif
-
- AVL_HANDLE root;
-}
-L_(avl);
-
-/* Function prototypes. */
-
-L_SC void L_(init)(L_(avl) *tree);
-
-L_SC int L_(is_empty)(L_(avl) *tree);
-
-L_SC AVL_HANDLE L_(insert)(L_(avl) *tree, AVL_HANDLE h);
-
-L_SC AVL_HANDLE L_(search)(L_(avl) *tree, AVL_KEY k, avl_search_type st);
-
-L_SC AVL_HANDLE L_(search_least)(L_(avl) *tree);
-
-L_SC AVL_HANDLE L_(search_greatest)(L_(avl) *tree);
-
-L_SC AVL_HANDLE L_(remove)(L_(avl) *tree, AVL_KEY k);
-
-L_SC AVL_HANDLE L_(subst)(L_(avl) *tree, AVL_HANDLE new_node);
-
-#ifdef AVL_BUILD_ITER_TYPE
-
-L_SC int L_(build)(
- L_(avl) *tree, AVL_BUILD_ITER_TYPE p, L_SIZE num_nodes);
-
-#endif
-
-/* ANSI C/ISO C++ require that a long have at least 32 bits. Set
-** L_EST_LONG_BIT to be the greatest multiple of 8 in the range
-** 32 - 64 (inclusive) that is less than or equal to the number of
-** bits in a long.
-*/
-
-#if (((LONG_MAX >> 31) >> 7) == 0)
-
-#define L_EST_LONG_BIT 32
-
-#elif (((LONG_MAX >> 31) >> 15) == 0)
-
-#define L_EST_LONG_BIT 40
-
-#elif (((LONG_MAX >> 31) >> 23) == 0)
-
-#define L_EST_LONG_BIT 48
-
-#elif (((LONG_MAX >> 31) >> 31) == 0)
-
-#define L_EST_LONG_BIT 56
-
-#else
-
-#define L_EST_LONG_BIT 64
-
-#endif
-
-/* Number of bits in a long. */
-#define L_LONG_BIT (sizeof(long) * CHAR_BIT)
-
-/* The macro L_BIT_ARR_DEFN defines a bit array whose index is a (0-based)
-** node depth. The definition depends on whether the maximum depth is more
-** or less than the number of bits in a single long.
-*/
-
-#if ((AVL_MAX_DEPTH) > L_EST_LONG_BIT)
-
-/* Maximum depth may be more than number of bits in a long. */
-
-#define L_BIT_ARR_DEFN(NAME) \
- unsigned long NAME[((AVL_MAX_DEPTH) + L_LONG_BIT - 1) / L_LONG_BIT];
-
-#else
-
-/* Maximum depth is definitely no greater than the number of bits in a long. */
-
-#define L_BIT_ARR_DEFN(NAME) unsigned long NAME;
-
-#endif
-
-/* Iterator structure. */
-typedef struct {
- /* Tree being iterated over. */
- L_(avl) *tree_;
-
- /* Records a path into the tree. If bit n is true, indicates
- ** take greater branch from the nth node in the path, otherwise
- ** take the less branch. bit 0 gives branch from root, and
- ** so on. */
- L_BIT_ARR_DEFN(branch)
-
- /* Zero-based depth of path into tree. */
- unsigned depth;
-
- /* Handles of nodes in path from root to current node (returned by *). */
- AVL_HANDLE path_h[(AVL_MAX_DEPTH) - 1];
-}
-L_(iter);
-
-/* Iterator function prototypes. */
-
-L_SC void L_(start_iter)(
- L_(avl) *tree, L_(iter) *iter, AVL_KEY k, avl_search_type st);
-
-L_SC void L_(start_iter_least)(L_(avl) *tree, L_(iter) *iter);
-
-L_SC void L_(start_iter_greatest)(L_(avl) *tree, L_(iter) *iter);
-
-L_SC AVL_HANDLE L_(get_iter)(L_(iter) *iter);
-
-L_SC void L_(incr_iter)(L_(iter) *iter);
-
-L_SC void L_(decr_iter)(L_(iter) *iter);
-
-L_SC void L_(init_iter)(L_(iter) *iter);
-
-#define AVL_IMPL_INIT 1
-#define AVL_IMPL_IS_EMPTY (1 << 1)
-#define AVL_IMPL_INSERT (1 << 2)
-#define AVL_IMPL_SEARCH (1 << 3)
-#define AVL_IMPL_SEARCH_LEAST (1 << 4)
-#define AVL_IMPL_SEARCH_GREATEST (1 << 5)
-#define AVL_IMPL_REMOVE (1 << 6)
-#define AVL_IMPL_BUILD (1 << 7)
-#define AVL_IMPL_START_ITER (1 << 8)
-#define AVL_IMPL_START_ITER_LEAST (1 << 9)
-#define AVL_IMPL_START_ITER_GREATEST (1 << 10)
-#define AVL_IMPL_GET_ITER (1 << 11)
-#define AVL_IMPL_INCR_ITER (1 << 12)
-#define AVL_IMPL_DECR_ITER (1 << 13)
-#define AVL_IMPL_INIT_ITER (1 << 14)
-#define AVL_IMPL_SUBST (1 << 15)
-
-#define AVL_IMPL_ALL (~0)
-
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef L_SC
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-
-#endif // VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IF_H_
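A client instantiates this generic package by defining the adapter macros before including the header, exactly as hmm_intrnl.h does further below. A minimal sketch with hypothetical names (my_node, my_avl_); the node accessor macros (AVL_GET_LESS, AVL_SET_LESS, AVL_COMPARE_KEY_NODE, and so on) would additionally be needed before including cavl_impl.h:

/* Hypothetical instantiation; none of these names come from libvpx. */
typedef struct my_node {
  struct my_node *less, *greater;
  int bf;   /* balance factor */
  long key;
} my_node;

#define AVL_UNIQUE(BASE) my_avl_ ## BASE
#define AVL_HANDLE my_node *
#define AVL_KEY long
#define AVL_MAX_DEPTH 32
#define AVL_NULL ((my_node *) 0)

#include "cavl_if.h" /* generates my_avl_avl, my_avl_insert(), ... */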
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_impl.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_impl.h
deleted file mode 100644
index 8b9ae27a8c9..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_impl.h
+++ /dev/null
@@ -1,1152 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IMPL_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IMPL_H_
-
-/* Abstract AVL Tree Generic C Package.
-** Implementation generation header file.
-**
-** This code is in the public domain. See cavl_tree.html for interface
-** documentation.
-**
-** Version: 1.5 Author: Walt Karas
-*/
-
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef l_tree
-#undef L_MASK_HIGH_BIT
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-#undef L_BIT_ARR_VAL
-#undef L_BIT_ARR_0
-#undef L_BIT_ARR_1
-#undef L_BIT_ARR_ALL
-#undef L_BIT_ARR_LONGS
-#undef L_IMPL_MASK
-#undef L_CHECK_READ_ERROR
-#undef L_CHECK_READ_ERROR_INV_DEPTH
-#undef L_SC
-#undef L_BALANCE_PARAM_PREFIX
-
-#ifdef AVL_UNIQUE
-
-#define L_ AVL_UNIQUE
-
-#else
-
-#define L_(X) X
-
-#endif
-
-/* Determine correct storage class for functions */
-#ifdef AVL_PRIVATE
-
-#define L_SC static
-
-#else
-
-#define L_SC
-
-#endif
-
-#ifdef AVL_SIZE
-
-#define L_SIZE AVL_SIZE
-
-#else
-
-#define L_SIZE unsigned long
-
-#endif
-
-#define L_MASK_HIGH_BIT ((int) ~ ((~ (unsigned) 0) >> 1))
-
-/* ANSI C/ISO C++ require that a long have at least 32 bits. Set
-** L_EST_LONG_BIT to be the greatest multiple of 8 in the range
-** 32 - 64 (inclusive) that is less than or equal to the number of
-** bits in a long.
-*/
-
-#if (((LONG_MAX >> 31) >> 7) == 0)
-
-#define L_EST_LONG_BIT 32
-
-#elif (((LONG_MAX >> 31) >> 15) == 0)
-
-#define L_EST_LONG_BIT 40
-
-#elif (((LONG_MAX >> 31) >> 23) == 0)
-
-#define L_EST_LONG_BIT 48
-
-#elif (((LONG_MAX >> 31) >> 31) == 0)
-
-#define L_EST_LONG_BIT 56
-
-#else
-
-#define L_EST_LONG_BIT 64
-
-#endif
-
-#define L_LONG_BIT (sizeof(long) * CHAR_BIT)
-
-#if ((AVL_MAX_DEPTH) > L_EST_LONG_BIT)
-
-/* The maximum depth may be greater than the number of bits in a long,
-** so multiple longs are needed to hold a bit array indexed by node
-** depth. */
-
-#define L_BIT_ARR_LONGS (((AVL_MAX_DEPTH) + L_LONG_BIT - 1) / L_LONG_BIT)
-
-#define L_BIT_ARR_DEFN(NAME) unsigned long NAME[L_BIT_ARR_LONGS];
-
-#define L_BIT_ARR_VAL(BIT_ARR, BIT_NUM) \
- ((BIT_ARR)[(BIT_NUM) / L_LONG_BIT] & (1L << ((BIT_NUM) % L_LONG_BIT)))
-
-#define L_BIT_ARR_0(BIT_ARR, BIT_NUM) \
- (BIT_ARR)[(BIT_NUM) / L_LONG_BIT] &= ~(1L << ((BIT_NUM) % L_LONG_BIT));
-
-#define L_BIT_ARR_1(BIT_ARR, BIT_NUM) \
- (BIT_ARR)[(BIT_NUM) / L_LONG_BIT] |= 1L << ((BIT_NUM) % L_LONG_BIT);
-
-#define L_BIT_ARR_ALL(BIT_ARR, BIT_VAL) \
- { int i = L_BIT_ARR_LONGS; do (BIT_ARR)[--i] = 0L - (BIT_VAL); while(i); }
-
-#else /* The bit array can definitely fit in one long */
-
-#define L_BIT_ARR_DEFN(NAME) unsigned long NAME;
-
-#define L_BIT_ARR_VAL(BIT_ARR, BIT_NUM) ((BIT_ARR) & (1L << (BIT_NUM)))
-
-#define L_BIT_ARR_0(BIT_ARR, BIT_NUM) (BIT_ARR) &= ~(1L << (BIT_NUM));
-
-#define L_BIT_ARR_1(BIT_ARR, BIT_NUM) (BIT_ARR) |= 1L << (BIT_NUM);
-
-#define L_BIT_ARR_ALL(BIT_ARR, BIT_VAL) (BIT_ARR) = 0L - (BIT_VAL);
-
-#endif
-
-#ifdef AVL_READ_ERRORS_HAPPEN
-
-#define L_CHECK_READ_ERROR(ERROR_RETURN) \
- { if (AVL_READ_ERROR) return(ERROR_RETURN); }
-
-#else
-
-#define L_CHECK_READ_ERROR(ERROR_RETURN)
-
-#endif
-
-/* The presumed reason that an instantiation places additional fields
-** inside the AVL tree structure is that the SET_ and GET_ macros
-** need these fields. The "balance" function does not explicitly use
-** any fields in the AVL tree structure, so only pass an AVL tree
-** structure pointer to "balance" if it has instantiation-specific
-** fields that are (presumably) needed by the SET_/GET_ calls within
-** "balance".
-*/
-#ifdef AVL_INSIDE_STRUCT
-
-#define L_BALANCE_PARAM_CALL_PREFIX l_tree,
-#define L_BALANCE_PARAM_DECL_PREFIX L_(avl) *l_tree,
-
-#else
-
-#define L_BALANCE_PARAM_CALL_PREFIX
-#define L_BALANCE_PARAM_DECL_PREFIX
-
-#endif
-
-#ifdef AVL_IMPL_MASK
-
-#define L_IMPL_MASK (AVL_IMPL_MASK)
-
-#else
-
-/* Define all functions. */
-#define L_IMPL_MASK AVL_IMPL_ALL
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_INIT)
-
-L_SC void L_(init)(L_(avl) *l_tree) {
- l_tree->root = AVL_NULL;
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_IS_EMPTY)
-
-L_SC int L_(is_empty)(L_(avl) *l_tree) {
- return(l_tree->root == AVL_NULL);
-}
-
-#endif
-
-/* Put the private balance function in the same compilation module as
-** the insert function. */
-#if (L_IMPL_MASK & AVL_IMPL_INSERT)
-
-/* Balances subtree, returns handle of root node of subtree after balancing.
-*/
-L_SC AVL_HANDLE L_(balance)(L_BALANCE_PARAM_DECL_PREFIX AVL_HANDLE bal_h) {
- AVL_HANDLE deep_h;
-
- /* Either the "greater than" or the "less than" subtree of
- ** this node has to be 2 levels deeper (or else it wouldn't
- ** need balancing).
- */
- if (AVL_GET_BALANCE_FACTOR(bal_h) > 0) {
- /* "Greater than" subtree is deeper. */
-
- deep_h = AVL_GET_GREATER(bal_h, 1);
-
- L_CHECK_READ_ERROR(AVL_NULL)
-
- if (AVL_GET_BALANCE_FACTOR(deep_h) < 0) {
- int bf;
-
- AVL_HANDLE old_h = bal_h;
- bal_h = AVL_GET_LESS(deep_h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- AVL_SET_GREATER(old_h, AVL_GET_LESS(bal_h, 1))
- AVL_SET_LESS(deep_h, AVL_GET_GREATER(bal_h, 1))
- AVL_SET_LESS(bal_h, old_h)
- AVL_SET_GREATER(bal_h, deep_h)
-
- bf = AVL_GET_BALANCE_FACTOR(bal_h);
-
- if (bf != 0) {
- if (bf > 0) {
- AVL_SET_BALANCE_FACTOR(old_h, -1)
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- } else {
- AVL_SET_BALANCE_FACTOR(deep_h, 1)
- AVL_SET_BALANCE_FACTOR(old_h, 0)
- }
-
- AVL_SET_BALANCE_FACTOR(bal_h, 0)
- } else {
- AVL_SET_BALANCE_FACTOR(old_h, 0)
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- }
- } else {
- AVL_SET_GREATER(bal_h, AVL_GET_LESS(deep_h, 0))
- AVL_SET_LESS(deep_h, bal_h)
-
- if (AVL_GET_BALANCE_FACTOR(deep_h) == 0) {
- AVL_SET_BALANCE_FACTOR(deep_h, -1)
- AVL_SET_BALANCE_FACTOR(bal_h, 1)
- } else {
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- AVL_SET_BALANCE_FACTOR(bal_h, 0)
- }
-
- bal_h = deep_h;
- }
- } else {
- /* "Less than" subtree is deeper. */
-
- deep_h = AVL_GET_LESS(bal_h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
-
- if (AVL_GET_BALANCE_FACTOR(deep_h) > 0) {
- int bf;
- AVL_HANDLE old_h = bal_h;
- bal_h = AVL_GET_GREATER(deep_h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- AVL_SET_LESS(old_h, AVL_GET_GREATER(bal_h, 0))
- AVL_SET_GREATER(deep_h, AVL_GET_LESS(bal_h, 0))
- AVL_SET_GREATER(bal_h, old_h)
- AVL_SET_LESS(bal_h, deep_h)
-
- bf = AVL_GET_BALANCE_FACTOR(bal_h);
-
- if (bf != 0) {
- if (bf < 0) {
- AVL_SET_BALANCE_FACTOR(old_h, 1)
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- } else {
- AVL_SET_BALANCE_FACTOR(deep_h, -1)
- AVL_SET_BALANCE_FACTOR(old_h, 0)
- }
-
- AVL_SET_BALANCE_FACTOR(bal_h, 0)
- } else {
- AVL_SET_BALANCE_FACTOR(old_h, 0)
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- }
- } else {
- AVL_SET_LESS(bal_h, AVL_GET_GREATER(deep_h, 0))
- AVL_SET_GREATER(deep_h, bal_h)
-
- if (AVL_GET_BALANCE_FACTOR(deep_h) == 0) {
- AVL_SET_BALANCE_FACTOR(deep_h, 1)
- AVL_SET_BALANCE_FACTOR(bal_h, -1)
- } else {
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- AVL_SET_BALANCE_FACTOR(bal_h, 0)
- }
-
- bal_h = deep_h;
- }
- }
-
- return(bal_h);
-}
-
-L_SC AVL_HANDLE L_(insert)(L_(avl) *l_tree, AVL_HANDLE h) {
- AVL_SET_LESS(h, AVL_NULL)
- AVL_SET_GREATER(h, AVL_NULL)
- AVL_SET_BALANCE_FACTOR(h, 0)
-
- if (l_tree->root == AVL_NULL)
- l_tree->root = h;
- else {
- /* Last unbalanced node encountered in search for insertion point. */
- AVL_HANDLE unbal = AVL_NULL;
- /* Parent of last unbalanced node. */
- AVL_HANDLE parent_unbal = AVL_NULL;
- /* Balance factor of last unbalanced node. */
- int unbal_bf;
-
- /* Zero-based depth in tree. */
- unsigned depth = 0, unbal_depth = 0;
-
- /* Records a path into the tree. If bit n is true, indicates
- ** take greater branch from the nth node in the path, otherwise
- ** take the less branch. bit 0 gives branch from root, and
- ** so on. */
- L_BIT_ARR_DEFN(branch)
-
- AVL_HANDLE hh = l_tree->root;
- AVL_HANDLE parent = AVL_NULL;
- int cmp;
-
- do {
- if (AVL_GET_BALANCE_FACTOR(hh) != 0) {
- unbal = hh;
- parent_unbal = parent;
- unbal_depth = depth;
- }
-
- cmp = AVL_COMPARE_NODE_NODE(h, hh);
-
- if (cmp == 0)
- /* Duplicate key. */
- return(hh);
-
- parent = hh;
-
- if (cmp > 0) {
- hh = AVL_GET_GREATER(hh, 1);
- L_BIT_ARR_1(branch, depth)
- } else {
- hh = AVL_GET_LESS(hh, 1);
- L_BIT_ARR_0(branch, depth)
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- depth++;
- } while (hh != AVL_NULL);
-
- /* Add node to insert as leaf of tree. */
- if (cmp < 0)
- AVL_SET_LESS(parent, h)
- else
- AVL_SET_GREATER(parent, h)
-
- depth = unbal_depth;
-
- if (unbal == AVL_NULL)
- hh = l_tree->root;
- else {
- cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
- depth++;
- unbal_bf = AVL_GET_BALANCE_FACTOR(unbal);
-
- if (cmp < 0)
- unbal_bf--;
- else /* cmp > 0 */
- unbal_bf++;
-
- hh = cmp < 0 ? AVL_GET_LESS(unbal, 1) : AVL_GET_GREATER(unbal, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
-
- if ((unbal_bf != -2) && (unbal_bf != 2)) {
- /* No rebalancing of tree is necessary. */
- AVL_SET_BALANCE_FACTOR(unbal, unbal_bf)
- unbal = AVL_NULL;
- }
- }
-
- if (hh != AVL_NULL)
- while (h != hh) {
- cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
- depth++;
-
- if (cmp < 0) {
- AVL_SET_BALANCE_FACTOR(hh, -1)
- hh = AVL_GET_LESS(hh, 1);
- } else { /* cmp > 0 */
- AVL_SET_BALANCE_FACTOR(hh, 1)
- hh = AVL_GET_GREATER(hh, 1);
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- if (unbal != AVL_NULL) {
- unbal = L_(balance)(L_BALANCE_PARAM_CALL_PREFIX unbal);
- L_CHECK_READ_ERROR(AVL_NULL)
-
- if (parent_unbal == AVL_NULL)
- l_tree->root = unbal;
- else {
- depth = unbal_depth - 1;
- cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
-
- if (cmp < 0)
- AVL_SET_LESS(parent_unbal, unbal)
- else /* cmp > 0 */
- AVL_SET_GREATER(parent_unbal, unbal)
- }
- }
-
- }
-
- return(h);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SEARCH)
-
-L_SC AVL_HANDLE L_(search)(L_(avl) *l_tree, AVL_KEY k, avl_search_type st) {
- int cmp, target_cmp;
- AVL_HANDLE match_h = AVL_NULL;
- AVL_HANDLE h = l_tree->root;
-
- if (st & AVL_LESS)
- target_cmp = 1;
- else if (st & AVL_GREATER)
- target_cmp = -1;
- else
- target_cmp = 0;
-
- while (h != AVL_NULL) {
- cmp = AVL_COMPARE_KEY_NODE(k, h);
-
- if (cmp == 0) {
- if (st & AVL_EQUAL) {
- match_h = h;
- break;
- }
-
- cmp = -target_cmp;
- } else if (target_cmp != 0)
- if (!((cmp ^ target_cmp) & L_MASK_HIGH_BIT))
- /* cmp and target_cmp are both positive or both negative. */
- match_h = h;
-
- h = cmp < 0 ? AVL_GET_LESS(h, 1) : AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- return(match_h);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SEARCH_LEAST)
-
-L_SC AVL_HANDLE L_(search_least)(L_(avl) *l_tree) {
- AVL_HANDLE h = l_tree->root;
- AVL_HANDLE parent = AVL_NULL;
-
- while (h != AVL_NULL) {
- parent = h;
- h = AVL_GET_LESS(h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- return(parent);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SEARCH_GREATEST)
-
-L_SC AVL_HANDLE L_(search_greatest)(L_(avl) *l_tree) {
- AVL_HANDLE h = l_tree->root;
- AVL_HANDLE parent = AVL_NULL;
-
- while (h != AVL_NULL) {
- parent = h;
- h = AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- return(parent);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_REMOVE)
-
-/* Prototype of balance function (called by remove) in case not in
-** same compilation unit.
-*/
-L_SC AVL_HANDLE L_(balance)(L_BALANCE_PARAM_DECL_PREFIX AVL_HANDLE bal_h);
-
-L_SC AVL_HANDLE L_(remove)(L_(avl) *l_tree, AVL_KEY k) {
- /* Zero-based depth in tree. */
- unsigned depth = 0, rm_depth;
-
- /* Records a path into the tree. If bit n is true, indicates
- ** take greater branch from the nth node in the path, otherwise
- ** take the less branch. bit 0 gives branch from root, and
- ** so on. */
- L_BIT_ARR_DEFN(branch)
-
- AVL_HANDLE h = l_tree->root;
- AVL_HANDLE parent = AVL_NULL;
- AVL_HANDLE child;
- AVL_HANDLE path;
- int cmp, cmp_shortened_sub_with_path;
- int reduced_depth;
- int bf;
- AVL_HANDLE rm;
- AVL_HANDLE parent_rm;
-
- for (;;) {
- if (h == AVL_NULL)
- /* No node in tree with given key. */
- return(AVL_NULL);
-
- cmp = AVL_COMPARE_KEY_NODE(k, h);
-
- if (cmp == 0)
- /* Found node to remove. */
- break;
-
- parent = h;
-
- if (cmp > 0) {
- h = AVL_GET_GREATER(h, 1);
- L_BIT_ARR_1(branch, depth)
- } else {
- h = AVL_GET_LESS(h, 1);
- L_BIT_ARR_0(branch, depth)
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- depth++;
- cmp_shortened_sub_with_path = cmp;
- }
-
- rm = h;
- parent_rm = parent;
- rm_depth = depth;
-
- /* If the node to remove is not a leaf node, we need to get a
- ** leaf node, or a node with a single leaf as its child, to put
- ** in the place of the node to remove. We will get the greatest
- ** node in the less subtree (of the node to remove), or the least
- ** node in the greater subtree. We take the leaf node from the
- ** deeper subtree, if there is one. */
-
- if (AVL_GET_BALANCE_FACTOR(h) < 0) {
- child = AVL_GET_LESS(h, 1);
- L_BIT_ARR_0(branch, depth)
- cmp = -1;
- } else {
- child = AVL_GET_GREATER(h, 1);
- L_BIT_ARR_1(branch, depth)
- cmp = 1;
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- depth++;
-
- if (child != AVL_NULL) {
- cmp = -cmp;
-
- do {
- parent = h;
- h = child;
-
- if (cmp < 0) {
- child = AVL_GET_LESS(h, 1);
- L_BIT_ARR_0(branch, depth)
- } else {
- child = AVL_GET_GREATER(h, 1);
- L_BIT_ARR_1(branch, depth)
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- depth++;
- } while (child != AVL_NULL);
-
- if (parent == rm)
-      /* Only went through the do loop once.  The deleted node will be replaced
- ** in the tree structure by one of its immediate children. */
- cmp_shortened_sub_with_path = -cmp;
- else
- cmp_shortened_sub_with_path = cmp;
-
-    /* Get the handle of the opposite child, which might be non-null. */
- child = cmp > 0 ? AVL_GET_LESS(h, 0) : AVL_GET_GREATER(h, 0);
- }
-
- if (parent == AVL_NULL)
- /* There were only 1 or 2 nodes in this tree. */
- l_tree->root = child;
- else if (cmp_shortened_sub_with_path < 0)
- AVL_SET_LESS(parent, child)
- else
- AVL_SET_GREATER(parent, child)
-
- /* "path" is the parent of the subtree being eliminated or reduced
- ** from a depth of 2 to 1. If "path" is the node to be removed, we
- ** set path to the node we're about to poke into the position of the
- ** node to be removed. */
- path = parent == rm ? h : parent;
-
- if (h != rm) {
- /* Poke in the replacement for the node to be removed. */
- AVL_SET_LESS(h, AVL_GET_LESS(rm, 0))
- AVL_SET_GREATER(h, AVL_GET_GREATER(rm, 0))
- AVL_SET_BALANCE_FACTOR(h, AVL_GET_BALANCE_FACTOR(rm))
-
- if (parent_rm == AVL_NULL)
- l_tree->root = h;
- else {
- depth = rm_depth - 1;
-
- if (L_BIT_ARR_VAL(branch, depth))
- AVL_SET_GREATER(parent_rm, h)
- else
- AVL_SET_LESS(parent_rm, h)
- }
- }
-
- if (path != AVL_NULL) {
- /* Create a temporary linked list from the parent of the path node
- ** to the root node. */
- h = l_tree->root;
- parent = AVL_NULL;
- depth = 0;
-
- while (h != path) {
- if (L_BIT_ARR_VAL(branch, depth)) {
- child = AVL_GET_GREATER(h, 1);
- AVL_SET_GREATER(h, parent)
- } else {
- child = AVL_GET_LESS(h, 1);
- AVL_SET_LESS(h, parent)
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- depth++;
- parent = h;
- h = child;
- }
-
- /* Climb from the path node to the root node using the linked
- ** list, restoring the tree structure and rebalancing as necessary.
- */
- reduced_depth = 1;
- cmp = cmp_shortened_sub_with_path;
-
- for (;;) {
- if (reduced_depth) {
- bf = AVL_GET_BALANCE_FACTOR(h);
-
- if (cmp < 0)
- bf++;
- else /* cmp > 0 */
- bf--;
-
- if ((bf == -2) || (bf == 2)) {
- h = L_(balance)(L_BALANCE_PARAM_CALL_PREFIX h);
- L_CHECK_READ_ERROR(AVL_NULL)
- bf = AVL_GET_BALANCE_FACTOR(h);
- } else
- AVL_SET_BALANCE_FACTOR(h, bf)
- reduced_depth = (bf == 0);
- }
-
- if (parent == AVL_NULL)
- break;
-
- child = h;
- h = parent;
- depth--;
- cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
-
- if (cmp < 0) {
- parent = AVL_GET_LESS(h, 1);
- AVL_SET_LESS(h, child)
- } else {
- parent = AVL_GET_GREATER(h, 1);
- AVL_SET_GREATER(h, child)
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- l_tree->root = h;
- }
-
- return(rm);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SUBST)
-
-L_SC AVL_HANDLE L_(subst)(L_(avl) *l_tree, AVL_HANDLE new_node) {
- AVL_HANDLE h = l_tree->root;
- AVL_HANDLE parent = AVL_NULL;
- int cmp, last_cmp;
-
- /* Search for node already in tree with same key. */
- for (;;) {
- if (h == AVL_NULL)
- /* No node in tree with same key as new node. */
- return(AVL_NULL);
-
- cmp = AVL_COMPARE_NODE_NODE(new_node, h);
-
- if (cmp == 0)
- /* Found the node to substitute new one for. */
- break;
-
- last_cmp = cmp;
- parent = h;
- h = cmp < 0 ? AVL_GET_LESS(h, 1) : AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- /* Copy tree housekeeping fields from node in tree to new node. */
- AVL_SET_LESS(new_node, AVL_GET_LESS(h, 0))
- AVL_SET_GREATER(new_node, AVL_GET_GREATER(h, 0))
- AVL_SET_BALANCE_FACTOR(new_node, AVL_GET_BALANCE_FACTOR(h))
-
- if (parent == AVL_NULL)
- /* New node is also new root. */
- l_tree->root = new_node;
- else {
- /* Make parent point to new node. */
- if (last_cmp < 0)
- AVL_SET_LESS(parent, new_node)
- else
- AVL_SET_GREATER(parent, new_node)
- }
-
- return(h);
-}
-
-#endif
-
-#ifdef AVL_BUILD_ITER_TYPE
-
-#if (L_IMPL_MASK & AVL_IMPL_BUILD)
-
-L_SC int L_(build)(
- L_(avl) *l_tree, AVL_BUILD_ITER_TYPE p, L_SIZE num_nodes) {
- /* Gives path to subtree being built. If bit n is false, branch
-  ** less from the node at depth n; if true, branch greater. */
- L_BIT_ARR_DEFN(branch)
-
- /* If bit n is true, then for the current subtree at depth n, its
- ** greater subtree has one more node than its less subtree. */
- L_BIT_ARR_DEFN(rem)
-
- /* Depth of root node of current subtree. */
- unsigned depth = 0;
-
- /* Number of nodes in current subtree. */
- L_SIZE num_sub = num_nodes;
-
- /* The algorithm relies on a stack of nodes whose less subtree has
- ** been built, but whose greater subtree has not yet been built.
-  ** The stack is implemented as a linked list.  The nodes are linked
- ** together by having the "greater" handle of a node set to the
- ** next node in the list. "less_parent" is the handle of the first
- ** node in the list. */
- AVL_HANDLE less_parent = AVL_NULL;
-
- /* h is root of current subtree, child is one of its children. */
- AVL_HANDLE h;
- AVL_HANDLE child;
-
- if (num_nodes == 0) {
- l_tree->root = AVL_NULL;
- return(1);
- }
-
- for (;;) {
- while (num_sub > 2) {
- /* Subtract one for root of subtree. */
- num_sub--;
-
- if (num_sub & 1)
- L_BIT_ARR_1(rem, depth)
- else
- L_BIT_ARR_0(rem, depth)
- L_BIT_ARR_0(branch, depth)
- depth++;
-
- num_sub >>= 1;
- }
-
- if (num_sub == 2) {
- /* Build a subtree with two nodes, slanting to greater.
- ** I arbitrarily chose to always have the extra node in the
- ** greater subtree when there is an odd number of nodes to
- ** split between the two subtrees. */
-
- h = AVL_BUILD_ITER_VAL(p);
- L_CHECK_READ_ERROR(0)
- AVL_BUILD_ITER_INCR(p)
- child = AVL_BUILD_ITER_VAL(p);
- L_CHECK_READ_ERROR(0)
- AVL_BUILD_ITER_INCR(p)
- AVL_SET_LESS(child, AVL_NULL)
- AVL_SET_GREATER(child, AVL_NULL)
- AVL_SET_BALANCE_FACTOR(child, 0)
- AVL_SET_GREATER(h, child)
- AVL_SET_LESS(h, AVL_NULL)
- AVL_SET_BALANCE_FACTOR(h, 1)
- } else { /* num_sub == 1 */
- /* Build a subtree with one node. */
-
- h = AVL_BUILD_ITER_VAL(p);
- L_CHECK_READ_ERROR(0)
- AVL_BUILD_ITER_INCR(p)
- AVL_SET_LESS(h, AVL_NULL)
- AVL_SET_GREATER(h, AVL_NULL)
- AVL_SET_BALANCE_FACTOR(h, 0)
- }
-
- while (depth) {
- depth--;
-
- if (!L_BIT_ARR_VAL(branch, depth))
- /* We've completed a less subtree. */
- break;
-
- /* We've completed a greater subtree, so attach it to
- ** its parent (that is less than it). We pop the parent
- ** off the stack of less parents. */
- child = h;
- h = less_parent;
- less_parent = AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR(0)
- AVL_SET_GREATER(h, child)
- /* num_sub = 2 * (num_sub - rem[depth]) + rem[depth] + 1 */
- num_sub <<= 1;
- num_sub += L_BIT_ARR_VAL(rem, depth) ? 0 : 1;
-
- if (num_sub & (num_sub - 1))
- /* num_sub is not a power of 2. */
- AVL_SET_BALANCE_FACTOR(h, 0)
- else
- /* num_sub is a power of 2. */
- AVL_SET_BALANCE_FACTOR(h, 1)
- }
-
- if (num_sub == num_nodes)
- /* We've completed the full tree. */
- break;
-
- /* The subtree we've completed is the less subtree of the
- ** next node in the sequence. */
-
- child = h;
- h = AVL_BUILD_ITER_VAL(p);
- L_CHECK_READ_ERROR(0)
- AVL_BUILD_ITER_INCR(p)
- AVL_SET_LESS(h, child)
-
- /* Put h into stack of less parents. */
- AVL_SET_GREATER(h, less_parent)
- less_parent = h;
-
- /* Proceed to creating greater than subtree of h. */
- L_BIT_ARR_1(branch, depth)
- num_sub += L_BIT_ARR_VAL(rem, depth) ? 1 : 0;
- depth++;
-
- } /* end for (;; ) */
-
- l_tree->root = h;
-
- return(1);
-}
-
-#endif
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_INIT_ITER)
-
-/* Initialize depth to invalid value, to indicate iterator is
-** invalid.  (Depth is zero-based.)  It's not necessary to initialize
-** iterators prior to passing them to the "start" function.
-*/
-L_SC void L_(init_iter)(L_(iter) *iter) {
- iter->depth = ~0;
-}
-
-#endif
-
-#ifdef AVL_READ_ERRORS_HAPPEN
-
-#define L_CHECK_READ_ERROR_INV_DEPTH \
- { if (AVL_READ_ERROR) { iter->depth = ~0; return; } }
-
-#else
-
-#define L_CHECK_READ_ERROR_INV_DEPTH
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_START_ITER)
-
-L_SC void L_(start_iter)(
- L_(avl) *l_tree, L_(iter) *iter, AVL_KEY k, avl_search_type st) {
- AVL_HANDLE h = l_tree->root;
- unsigned d = 0;
- int cmp, target_cmp;
-
- /* Save the tree that we're going to iterate through in a
- ** member variable. */
- iter->tree_ = l_tree;
-
- iter->depth = ~0;
-
- if (h == AVL_NULL)
- /* Tree is empty. */
- return;
-
- if (st & AVL_LESS)
- /* Key can be greater than key of starting node. */
- target_cmp = 1;
- else if (st & AVL_GREATER)
- /* Key can be less than key of starting node. */
- target_cmp = -1;
- else
- /* Key must be same as key of starting node. */
- target_cmp = 0;
-
- for (;;) {
- cmp = AVL_COMPARE_KEY_NODE(k, h);
-
- if (cmp == 0) {
- if (st & AVL_EQUAL) {
- /* Equal node was sought and found as starting node. */
- iter->depth = d;
- break;
- }
-
- cmp = -target_cmp;
- } else if (target_cmp != 0)
- if (!((cmp ^ target_cmp) & L_MASK_HIGH_BIT))
- /* cmp and target_cmp are both negative or both positive. */
- iter->depth = d;
-
- h = cmp < 0 ? AVL_GET_LESS(h, 1) : AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR_INV_DEPTH
-
- if (h == AVL_NULL)
- break;
-
- if (cmp > 0)
- L_BIT_ARR_1(iter->branch, d)
- else
- L_BIT_ARR_0(iter->branch, d)
- iter->path_h[d++] = h;
- }
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_START_ITER_LEAST)
-
-L_SC void L_(start_iter_least)(L_(avl) *l_tree, L_(iter) *iter) {
- AVL_HANDLE h = l_tree->root;
-
- iter->tree_ = l_tree;
-
- iter->depth = ~0;
-
- L_BIT_ARR_ALL(iter->branch, 0)
-
- while (h != AVL_NULL) {
- if (iter->depth != ~0)
- iter->path_h[iter->depth] = h;
-
- iter->depth++;
- h = AVL_GET_LESS(h, 1);
- L_CHECK_READ_ERROR_INV_DEPTH
- }
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_START_ITER_GREATEST)
-
-L_SC void L_(start_iter_greatest)(L_(avl) *l_tree, L_(iter) *iter) {
- AVL_HANDLE h = l_tree->root;
-
- iter->tree_ = l_tree;
-
- iter->depth = ~0;
-
- L_BIT_ARR_ALL(iter->branch, 1)
-
- while (h != AVL_NULL) {
- if (iter->depth != ~0)
- iter->path_h[iter->depth] = h;
-
- iter->depth++;
- h = AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR_INV_DEPTH
- }
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_GET_ITER)
-
-L_SC AVL_HANDLE L_(get_iter)(L_(iter) *iter) {
- if (iter->depth == ~0)
- return(AVL_NULL);
-
- return(iter->depth == 0 ?
- iter->tree_->root : iter->path_h[iter->depth - 1]);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_INCR_ITER)
-
-L_SC void L_(incr_iter)(L_(iter) *iter) {
-#define l_tree (iter->tree_)
-
- if (iter->depth != ~0) {
- AVL_HANDLE h =
- AVL_GET_GREATER((iter->depth == 0 ?
- iter->tree_->root : iter->path_h[iter->depth - 1]), 1);
- L_CHECK_READ_ERROR_INV_DEPTH
-
- if (h == AVL_NULL)
- do {
- if (iter->depth == 0) {
- iter->depth = ~0;
- break;
- }
-
- iter->depth--;
- } while (L_BIT_ARR_VAL(iter->branch, iter->depth));
- else {
- L_BIT_ARR_1(iter->branch, iter->depth)
- iter->path_h[iter->depth++] = h;
-
- for (;;) {
- h = AVL_GET_LESS(h, 1);
- L_CHECK_READ_ERROR_INV_DEPTH
-
- if (h == AVL_NULL)
- break;
-
- L_BIT_ARR_0(iter->branch, iter->depth)
- iter->path_h[iter->depth++] = h;
- }
- }
- }
-
-#undef l_tree
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_DECR_ITER)
-
-L_SC void L_(decr_iter)(L_(iter) *iter) {
-#define l_tree (iter->tree_)
-
- if (iter->depth != ~0) {
- AVL_HANDLE h =
- AVL_GET_LESS((iter->depth == 0 ?
- iter->tree_->root : iter->path_h[iter->depth - 1]), 1);
- L_CHECK_READ_ERROR_INV_DEPTH
-
- if (h == AVL_NULL)
- do {
- if (iter->depth == 0) {
- iter->depth = ~0;
- break;
- }
-
- iter->depth--;
- } while (!L_BIT_ARR_VAL(iter->branch, iter->depth));
- else {
- L_BIT_ARR_0(iter->branch, iter->depth)
- iter->path_h[iter->depth++] = h;
-
- for (;;) {
- h = AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR_INV_DEPTH
-
- if (h == AVL_NULL)
- break;
-
- L_BIT_ARR_1(iter->branch, iter->depth)
- iter->path_h[iter->depth++] = h;
- }
- }
- }
-
-#undef l_tree
-}
-
-#endif
-
-/* Tidy up the preprocessor symbol name space. */
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef L_MASK_HIGH_BIT
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-#undef L_BIT_ARR_VAL
-#undef L_BIT_ARR_0
-#undef L_BIT_ARR_1
-#undef L_BIT_ARR_ALL
-#undef L_CHECK_READ_ERROR
-#undef L_CHECK_READ_ERROR_INV_DEPTH
-#undef L_BIT_ARR_LONGS
-#undef L_IMPL_MASK
-#undef L_CHECK_READ_ERROR
-#undef L_CHECK_READ_ERROR_INV_DEPTH
-#undef L_SC
-#undef L_BALANCE_PARAM_CALL_PREFIX
-#undef L_BALANCE_PARAM_DECL_PREFIX
-
-#endif // VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IMPL_H_
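Continuing the hypothetical my_avl_ instantiation sketched after the interface header above, an ascending walk over a tree would look roughly like this:

/* Sketch only; my_avl_* and my_node are the hypothetical names above. */
static void walk_ascending(my_avl_avl *tree) {
  my_avl_iter it;
  my_node *h;

  my_avl_start_iter_least(tree, &it);
  while ((h = my_avl_get_iter(&it)) != AVL_NULL) {
    /* visit h->key here */
    my_avl_incr_iter(&it);
  }
}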
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/heapmm.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/heapmm.h
deleted file mode 100644
index d584b1951c5..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/heapmm.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_HEAPMM_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_HEAPMM_H_
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-/* External header file for Heap Memory Manager. See documentation in
-** heapmm.html.
-*/
-
-#undef HMM_PROCESS
-
-/* Include once per configuration in a particular translation unit. */
-
-#ifndef HMM_CNFG_NUM
-
-/* Default configuration. */
-
-#ifndef HMM_INC_CNFG_DFLT
-#define HMM_INC_CNFG_DFLT
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 0
-
-/* Test configuration. */
-
-#ifndef HMM_INC_CNFG_0
-#define HMM_INC_CNFG_0
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 1
-
-#ifndef HMM_INC_CNFG_1
-#define HMM_INC_CNFG_1
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 2
-
-#ifndef HMM_INC_CNFG_2
-#define HMM_INC_CNFG_2
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 3
-
-#ifndef HMM_INC_CNFG_3
-#define HMM_INC_CNFG_3
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 4
-
-#ifndef HMM_INC_CNFG_4
-#define HMM_INC_CNFG_4
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 5
-
-#ifndef HMM_INC_CNFG_5
-#define HMM_INC_CNFG_5
-#define HMM_PROCESS
-#endif
-
-#endif
-
-#ifdef HMM_PROCESS
-
-#include "hmm_cnfg.h"
-
-/* Heap descriptor. */
-typedef struct HMM_UNIQUE(structure) {
- /* private: */
-
- /* Pointer to (payload of) root node in AVL tree. This field should
- ** really be the AVL tree descriptor (type avl_avl). But (in the
- ** instantiation of the AVL tree generic package used in package) the
- ** AVL tree descriptor simply contains a pointer to the root. So,
- ** whenever a pointer to the AVL tree descriptor is needed, I use the
- ** cast:
- **
- ** (avl_avl *) &(heap_desc->avl_tree_root)
- **
- ** (where heap_desc is a pointer to a heap descriptor). This trick
- ** allows me to avoid including cavl_if.h in this external header. */
- void *avl_tree_root;
-
- /* Pointer to first byte of last block freed, after any coalescing. */
- void *last_freed;
-
- /* public: */
-
- HMM_UNIQUE(size_bau) num_baus_can_shrink;
- void *end_of_shrinkable_chunk;
-}
-HMM_UNIQUE(descriptor);
-
-/* Prototypes for externally-callable functions. */
-
-void HMM_UNIQUE(init)(HMM_UNIQUE(descriptor) *desc);
-
-void *HMM_UNIQUE(alloc)(
- HMM_UNIQUE(descriptor) *desc, HMM_UNIQUE(size_aau) num_addr_align_units);
-
-/* NOT YET IMPLEMENTED */
-void *HMM_UNIQUE(greedy_alloc)(
- HMM_UNIQUE(descriptor) *desc, HMM_UNIQUE(size_aau) needed_addr_align_units,
- HMM_UNIQUE(size_aau) coveted_addr_align_units);
-
-int HMM_UNIQUE(resize)(
- HMM_UNIQUE(descriptor) *desc, void *mem,
- HMM_UNIQUE(size_aau) num_addr_align_units);
-
-/* NOT YET IMPLEMENTED */
-int HMM_UNIQUE(greedy_resize)(
- HMM_UNIQUE(descriptor) *desc, void *mem,
- HMM_UNIQUE(size_aau) needed_addr_align_units,
- HMM_UNIQUE(size_aau) coveted_addr_align_units);
-
-void HMM_UNIQUE(free)(HMM_UNIQUE(descriptor) *desc, void *mem);
-
-HMM_UNIQUE(size_aau) HMM_UNIQUE(true_size)(void *mem);
-
-HMM_UNIQUE(size_aau) HMM_UNIQUE(largest_available)(
- HMM_UNIQUE(descriptor) *desc);
-
-void HMM_UNIQUE(new_chunk)(
- HMM_UNIQUE(descriptor) *desc, void *start_of_chunk,
- HMM_UNIQUE(size_bau) num_block_align_units);
-
-void HMM_UNIQUE(grow_chunk)(
- HMM_UNIQUE(descriptor) *desc, void *end_of_chunk,
- HMM_UNIQUE(size_bau) num_block_align_units);
-
-void HMM_UNIQUE(shrink_chunk)(
- HMM_UNIQUE(descriptor) *desc,
- HMM_UNIQUE(size_bau) num_block_align_units);
-
-#endif /* defined HMM_PROCESS */
-#endif // VPX_MEM_MEMORY_MANAGER_INCLUDE_HEAPMM_H_
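The typical lifecycle of this API, as the pre-removal vpx_mem.c below drives it: initialize the descriptor, donate a chunk of backing storage, then allocate and free. A minimal sketch under the default hmm_ configuration; the arena size is illustrative and its alignment is assumed adequate for HMM_ADDR_ALIGN_UNIT:

#include "heapmm.h"

static hmm_descriptor desc;
static char arena[1 << 20]; /* backing store; alignment assumed adequate */

static void example(void) {
  hmm_init(&desc);

  /* Chunk size is given in BAUs:
     bytes / (HMM_ADDR_ALIGN_UNIT * HMM_BLOCK_ALIGN_UNIT). */
  hmm_new_chunk(&desc, arena,
                sizeof(arena) / (HMM_ADDR_ALIGN_UNIT * HMM_BLOCK_ALIGN_UNIT));

  {
    void *p = hmm_alloc(&desc, 4); /* request 4 AAUs of payload */
    if (p)
      hmm_free(&desc, p);
  }
}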
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h
deleted file mode 100644
index caa8713cfc8..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_CNFG_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_CNFG_H_
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-/* Configure Heap Memory Manager for processor architecture, compiler,
-** and desired performance characteristics. This file is included
-** by heapmm.h, so these definitions can be used by code external to
-** HMM. You can change the default configuration, and/or create alternate
-** configuration(s).
-*/
-
-/* To allow for multiple configurations of HMM to be used in the same
-** compilation unit, undefine all preprocessor symbols that will be
-** defined below.
-*/
-#undef HMM_ADDR_ALIGN_UNIT
-#undef HMM_BLOCK_ALIGN_UNIT
-#undef HMM_UNIQUE
-#undef HMM_DESC_PARAM
-#undef HMM_SYM_TO_STRING
-#undef HMM_SYM_TO_STRING_
-#undef HMM_AUDIT_FAIL
-
-/* Turn X into a string after one macro expansion pass of X. This trick
-** works with both GCC and Visual C++. */
-#define HMM_SYM_TO_STRING(X) HMM_SYM_TO_STRING_(X)
-#define HMM_SYM_TO_STRING_(X) #X
-
-#ifndef HMM_CNFG_NUM
-
-/* Default configuration. */
-
-/* Use hmm_ prefix to avoid identifier conflicts. */
-#define HMM_UNIQUE(BASE) hmm_ ## BASE
-
-/* Number of bytes in an Address Alignment Unit (AAU). */
-// fwg
-// #define HMM_ADDR_ALIGN_UNIT sizeof(int)
-#define HMM_ADDR_ALIGN_UNIT 32
-
-/* Number of AAUs in a Block Alignment Unit (BAU). */
-#define HMM_BLOCK_ALIGN_UNIT 1
-
-/* Type of unsigned integer big enough to hold the size of a Block in AAUs. */
-typedef unsigned long HMM_UNIQUE(size_aau);
-
-/* Type of unsigned integer big enough to hold the size of a Block/Chunk
-** in BAUs. The high bit will be robbed. */
-typedef unsigned long HMM_UNIQUE(size_bau);
-
-void hmm_dflt_abort(const char *, const char *);
-
-/* Actions upon a self-audit failure. Must expand to a single complete
-** statement. If you remove the definition of this macro, no self-auditing
-** will be performed. */
-#define HMM_AUDIT_FAIL \
- hmm_dflt_abort(__FILE__, HMM_SYM_TO_STRING(__LINE__));
-
-#elif HMM_CNFG_NUM == 0
-
-/* Definitions for testing. */
-
-#define HMM_UNIQUE(BASE) thmm_ ## BASE
-
-#define HMM_ADDR_ALIGN_UNIT sizeof(int)
-
-#define HMM_BLOCK_ALIGN_UNIT 3
-
-typedef unsigned HMM_UNIQUE(size_aau);
-
-typedef unsigned short HMM_UNIQUE(size_bau);
-
-/* Under this test setup, a long jump is done if there is a self-audit
-** failure.
-*/
-
-extern jmp_buf HMM_UNIQUE(jmp_buf);
-extern const char *HMM_UNIQUE(fail_file);
-extern unsigned HMM_UNIQUE(fail_line);
-
-#define HMM_AUDIT_FAIL \
- { HMM_UNIQUE(fail_file) = __FILE__; HMM_UNIQUE(fail_line) = __LINE__; \
- longjmp(HMM_UNIQUE(jmp_buf), 1); }
-
-#elif HMM_CNFG_NUM == 1
-
-/* Put configuration 1 definitions here (if there is a configuration 1). */
-
-#elif HMM_CNFG_NUM == 2
-
-/* Put configuration 2 definitions here. */
-
-#elif HMM_CNFG_NUM == 3
-
-/* Put configuration 3 definitions here. */
-
-#elif HMM_CNFG_NUM == 4
-
-/* Put configuration 4 definitions here. */
-
-#elif HMM_CNFG_NUM == 5
-
-/* Put configuration 5 definitions here. */
-
-#endif
-
-#endif // VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_CNFG_H_
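A sketch of what a filled-in alternate configuration (say, configuration 1) might contain. Every value here is hypothetical, chosen only to show the required pieces: the name prefix, the two alignment units, the two size types, and the audit-failure statement:

#include <stdlib.h> /* for abort() */

#define HMM_UNIQUE(BASE) hmm1_ ## BASE

#define HMM_ADDR_ALIGN_UNIT sizeof(void *)
#define HMM_BLOCK_ALIGN_UNIT 1

typedef unsigned long HMM_UNIQUE(size_aau);
typedef unsigned long HMM_UNIQUE(size_bau);

/* Must expand to a single complete statement. */
#define HMM_AUDIT_FAIL abort();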
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h
deleted file mode 100644
index 7302aa28c24..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_INTRNL_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_INTRNL_H_
-
-#ifdef __uClinux__
-# include <lddk.h>
-#endif
-
-#include "heapmm.h"
-
-#define U(BASE) HMM_UNIQUE(BASE)
-
-/* Mask of high bit of variable of size_bau type. */
-#define HIGH_BIT_BAU_SIZE \
- ((U(size_bau)) ~ (((U(size_bau)) ~ (U(size_bau)) 0) >> 1))
-
-/* Add a given number of AAUs to a pointer. */
-#define AAUS_FORWARD(PTR, AAU_OFFSET) \
- (((char *) (PTR)) + ((AAU_OFFSET) * ((U(size_aau)) HMM_ADDR_ALIGN_UNIT)))
-
-/* Subtract a given number of AAUs from a pointer. */
-#define AAUS_BACKWARD(PTR, AAU_OFFSET) \
- (((char *) (PTR)) - ((AAU_OFFSET) * ((U(size_aau)) HMM_ADDR_ALIGN_UNIT)))
-
-/* Add a given number of BAUs to a pointer. */
-#define BAUS_FORWARD(PTR, BAU_OFFSET) \
- AAUS_FORWARD((PTR), (BAU_OFFSET) * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT))
-
-/* Subtract a given number of BAUs from a pointer. */
-#define BAUS_BACKWARD(PTR, BAU_OFFSET) \
- AAUS_BACKWARD((PTR), (BAU_OFFSET) * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT))
-
-typedef struct head_struct {
- /* Sizes in Block Alignment Units. */
- HMM_UNIQUE(size_bau) previous_block_size, block_size;
-}
-head_record;
-
-typedef struct ptr_struct {
- struct ptr_struct *self, *prev, *next;
-}
-ptr_record;
-
-/* Divide and round up any fraction to the next whole number. */
-#define DIV_ROUND_UP(NUMER, DENOM) (((NUMER) + (DENOM) - 1) / (DENOM))
-
-/* Number of AAUs in a block head. */
-#define HEAD_AAUS DIV_ROUND_UP(sizeof(head_record), HMM_ADDR_ALIGN_UNIT)
-
-/* Number of AAUs in a block pointer record. */
-#define PTR_RECORD_AAUS DIV_ROUND_UP(sizeof(ptr_record), HMM_ADDR_ALIGN_UNIT)
-
-/* Number of BAUs in a dummy end record (at end of chunk). */
-#define DUMMY_END_BLOCK_BAUS DIV_ROUND_UP(HEAD_AAUS, HMM_BLOCK_ALIGN_UNIT)
-
-/* Minimum number of BAUs in a block (allowing room for the pointer record). */
-#define MIN_BLOCK_BAUS \
- DIV_ROUND_UP(HEAD_AAUS + PTR_RECORD_AAUS, HMM_BLOCK_ALIGN_UNIT)
-
-/* Return number of BAUs in block (masking off high bit containing block
-** status). */
-#define BLOCK_BAUS(HEAD_PTR) \
- (((head_record *) (HEAD_PTR))->block_size & ~HIGH_BIT_BAU_SIZE)
-
-/* Return number of BAUs in previous block (masking off high bit containing
-** block status). */
-#define PREV_BLOCK_BAUS(HEAD_PTR) \
- (((head_record *) (HEAD_PTR))->previous_block_size & ~HIGH_BIT_BAU_SIZE)
-
-/* Set number of BAUs in previous block, preserving high bit containing
-** block status. */
-#define SET_PREV_BLOCK_BAUS(HEAD_PTR, N_BAUS) \
- { register head_record *h_ptr = (head_record *) (HEAD_PTR); \
- h_ptr->previous_block_size &= HIGH_BIT_BAU_SIZE; \
- h_ptr->previous_block_size |= (N_BAUS); }
-
-/* Convert pointer to pointer record of block to pointer to block's head
-** record. */
-#define PTR_REC_TO_HEAD(PTR_REC_PTR) \
- ((head_record *) AAUS_BACKWARD(PTR_REC_PTR, HEAD_AAUS))
-
-/* Convert pointer to block head to pointer to block's pointer record. */
-#define HEAD_TO_PTR_REC(HEAD_PTR) \
- ((ptr_record *) AAUS_FORWARD(HEAD_PTR, HEAD_AAUS))
-
-/* Returns non-zero if block is allocated. */
-#define IS_BLOCK_ALLOCATED(HEAD_PTR) \
- (((((head_record *) (HEAD_PTR))->block_size | \
- ((head_record *) (HEAD_PTR))->previous_block_size) & \
- HIGH_BIT_BAU_SIZE) == 0)
-
-#define MARK_BLOCK_ALLOCATED(HEAD_PTR) \
- { register head_record *h_ptr = (head_record *) (HEAD_PTR); \
- h_ptr->block_size &= ~HIGH_BIT_BAU_SIZE; \
- h_ptr->previous_block_size &= ~HIGH_BIT_BAU_SIZE; }
-
-/* Mark a block as free when it is not the first block in a bin (and
-** therefore not a node in the AVL tree). */
-#define MARK_SUCCESSIVE_BLOCK_IN_FREE_BIN(HEAD_PTR) \
- { register head_record *h_ptr = (head_record *) (HEAD_PTR); \
- h_ptr->block_size |= HIGH_BIT_BAU_SIZE; }
-
-/* Prototypes for internal functions implemented in one file and called in
-** another.
-*/
-
-void U(into_free_collection)(U(descriptor) *desc, head_record *head_ptr);
-
-void U(out_of_free_collection)(U(descriptor) *desc, head_record *head_ptr);
-
-void *U(alloc_from_bin)(
- U(descriptor) *desc, ptr_record *bin_front_ptr, U(size_bau) n_baus);
-
-#ifdef HMM_AUDIT_FAIL
-
-/* Simply contains a reference to the HMM_AUDIT_FAIL macro and a
-** dummy return. */
-int U(audit_block_fail_dummy_return)(void);
-
-
-/* Auditing a block consists of checking that the size in its head
-** matches the previous block size in the head of the next block. */
-#define AUDIT_BLOCK_AS_EXPR(HEAD_PTR) \
- ((BLOCK_BAUS(HEAD_PTR) == \
- PREV_BLOCK_BAUS(BAUS_FORWARD(HEAD_PTR, BLOCK_BAUS(HEAD_PTR)))) ? \
- 0 : U(audit_block_fail_dummy_return)())
-
-#define AUDIT_BLOCK(HEAD_PTR) \
- { void *h_ptr = (HEAD_PTR); AUDIT_BLOCK_AS_EXPR(h_ptr); }
-
-#endif
-
-/* Interface to AVL tree generic package instantiation. */
-
-#define AVL_UNIQUE(BASE) U(avl_ ## BASE)
-
-#define AVL_HANDLE ptr_record *
-
-#define AVL_KEY U(size_bau)
-
-#define AVL_MAX_DEPTH 64
-
-#include "cavl_if.h"
-
-#endif // VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_INTRNL_H_
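The status encoding these macros implement: a block's free/allocated state rides in the high bit of the stored sizes, and BLOCK_BAUS/PREV_BLOCK_BAUS mask it back off when reading. A self-contained sketch using a 16-bit size type for readability (the real size_bau type is configuration-dependent):

#include <stdio.h>

typedef unsigned short bau_size; /* stand-in for HMM_UNIQUE(size_bau) */
#define HIGH_BIT ((bau_size) ~(((bau_size) ~(bau_size) 0) >> 1)) /* 0x8000 */

int main(void) {
  bau_size block_size = 40;

  block_size |= HIGH_BIT; /* mark free (successive block in a free bin) */

  printf("stored=0x%x size=%u\n",
         (unsigned) block_size,
         (unsigned) (block_size & (bau_size) ~HIGH_BIT)); /* 0x8028, 40 */
  return 0;
}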
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c
index da616425c69..b60d7319cc3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c
@@ -18,113 +18,11 @@
#include "include/vpx_mem_intrnl.h"
#include "vpx/vpx_integer.h"
-#if CONFIG_MEM_TRACKER
-#ifndef VPX_NO_GLOBALS
-static unsigned long g_alloc_count = 0;
-#else
-#include "vpx_global_handling.h"
-#define g_alloc_count vpxglobalm(vpxmem,g_alloc_count)
-#endif
-#endif
-
-#if CONFIG_MEM_MANAGER
-# include "heapmm.h"
-# include "hmm_intrnl.h"
-
-# define SHIFT_HMM_ADDR_ALIGN_UNIT 5
-# define TOTAL_MEMORY_TO_ALLOCATE 20971520 /* 20 * 1024 * 1024 */
-
-# define MM_DYNAMIC_MEMORY 1
-# if MM_DYNAMIC_MEMORY
-static unsigned char *g_p_mng_memory_raw = NULL;
-static unsigned char *g_p_mng_memory = NULL;
-# else
-static unsigned char g_p_mng_memory[TOTAL_MEMORY_TO_ALLOCATE];
-# endif
-
-static size_t g_mm_memory_size = TOTAL_MEMORY_TO_ALLOCATE;
-
-static hmm_descriptor hmm_d;
-static int g_mng_memory_allocated = 0;
-
-static int vpx_mm_create_heap_memory();
-static void *vpx_mm_realloc(void *memblk, size_t size);
-#endif /*CONFIG_MEM_MANAGER*/
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-struct GLOBAL_FUNC_POINTERS {
- g_malloc_func g_malloc;
- g_calloc_func g_calloc;
- g_realloc_func g_realloc;
- g_free_func g_free;
- g_memcpy_func g_memcpy;
- g_memset_func g_memset;
- g_memmove_func g_memmove;
-} *g_func = NULL;
-
-# define VPX_MALLOC_L g_func->g_malloc
-# define VPX_REALLOC_L g_func->g_realloc
-# define VPX_FREE_L g_func->g_free
-# define VPX_MEMCPY_L g_func->g_memcpy
-# define VPX_MEMSET_L g_func->g_memset
-# define VPX_MEMMOVE_L g_func->g_memmove
-#else
-# define VPX_MALLOC_L malloc
-# define VPX_REALLOC_L realloc
-# define VPX_FREE_L free
-# define VPX_MEMCPY_L memcpy
-# define VPX_MEMSET_L memset
-# define VPX_MEMMOVE_L memmove
-#endif /* USE_GLOBAL_FUNCTION_POINTERS */
-
-unsigned int vpx_mem_get_version() {
- unsigned int ver = ((unsigned int)(unsigned char)VPX_MEM_VERSION_CHIEF << 24 |
- (unsigned int)(unsigned char)VPX_MEM_VERSION_MAJOR << 16 |
- (unsigned int)(unsigned char)VPX_MEM_VERSION_MINOR << 8 |
- (unsigned int)(unsigned char)VPX_MEM_VERSION_PATCH);
- return ver;
-}
-
-int vpx_mem_set_heap_size(size_t size) {
- int ret = -1;
-
-#if CONFIG_MEM_MANAGER
-#if MM_DYNAMIC_MEMORY
-
- if (!g_mng_memory_allocated && size) {
- g_mm_memory_size = size;
- ret = 0;
- } else
- ret = -3;
-
-#else
- ret = -2;
-#endif
-#else
- (void)size;
-#endif
-
- return ret;
-}
-
void *vpx_memalign(size_t align, size_t size) {
void *addr,
* x = NULL;
-#if CONFIG_MEM_MANAGER
- int number_aau;
-
- if (vpx_mm_create_heap_memory() < 0) {
- _P(printf("[vpx][mm] ERROR vpx_memalign() Couldn't create memory for Heap.\n");)
- }
-
- number_aau = ((size + align - 1 + ADDRESS_STORAGE_SIZE) >>
- SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
-
- addr = hmm_alloc(&hmm_d, number_aau);
-#else
- addr = VPX_MALLOC_L(size + align - 1 + ADDRESS_STORAGE_SIZE);
-#endif /*CONFIG_MEM_MANAGER*/
+ addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE);
if (addr) {
x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
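The pattern in play here: over-allocate, align past a pointer-sized slot, and stash the raw malloc address just below the returned pointer so vpx_free() can recover it via ((size_t *)memblk)[-1]. A standalone sketch; align is assumed to be a power of two, and align_addr/ADDRESS_STORAGE_SIZE are approximated rather than quoted:

#include <stdint.h>
#include <stdlib.h>

static void *sketch_memalign(size_t align, size_t size) {
  /* payload + worst-case alignment slack + one stored pointer */
  void *raw = malloc(size + align - 1 + sizeof(size_t));
  uintptr_t p;

  if (!raw) return NULL;
  /* Round up past the storage slot to the next align boundary. */
  p = ((uintptr_t) raw + sizeof(size_t) + align - 1) & ~(uintptr_t)(align - 1);
  ((size_t *) p)[-1] = (size_t) raw; /* what the free path reads back */
  return (void *) p;
}

static void sketch_free(void *mem) {
  if (mem) free((void *) ((size_t *) mem)[-1]);
}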
@@ -145,7 +43,7 @@ void *vpx_calloc(size_t num, size_t size) {
x = vpx_memalign(DEFAULT_ALIGNMENT, num * size);
if (x)
- VPX_MEMSET_L(x, 0, num * size);
+ memset(x, 0, num * size);
return x;
}
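Note that num * size above is multiplied unchecked; a caller-side guard would look like the following sketch, which is not part of the original code:

#include <stddef.h>

/* Returns nonzero if num * size would wrap around size_t. */
static int mul_would_overflow(size_t num, size_t size) {
  return size != 0 && num > (size_t) -1 / size;
}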
@@ -171,11 +69,7 @@ void *vpx_realloc(void *memblk, size_t size) {
addr = (void *)(((size_t *)memblk)[-1]);
memblk = NULL;
-#if CONFIG_MEM_MANAGER
- new_addr = vpx_mm_realloc(addr, size + align + ADDRESS_STORAGE_SIZE);
-#else
- new_addr = VPX_REALLOC_L(addr, size + align + ADDRESS_STORAGE_SIZE);
-#endif
+ new_addr = realloc(addr, size + align + ADDRESS_STORAGE_SIZE);
if (new_addr) {
addr = new_addr;
@@ -193,280 +87,12 @@ void *vpx_realloc(void *memblk, size_t size) {
void vpx_free(void *memblk) {
if (memblk) {
void *addr = (void *)(((size_t *)memblk)[-1]);
-#if CONFIG_MEM_MANAGER
- hmm_free(&hmm_d, addr);
-#else
- VPX_FREE_L(addr);
-#endif
- }
-}
-
-#if CONFIG_MEM_TRACKER
-void *xvpx_memalign(size_t align, size_t size, char *file, int line) {
-#if TRY_BOUNDS_CHECK
- unsigned char *x_bounds;
-#endif
-
- void *x;
-
- if (g_alloc_count == 0) {
-#if TRY_BOUNDS_CHECK
- int i_rv = vpx_memory_tracker_init(BOUNDS_CHECK_PAD_SIZE, BOUNDS_CHECK_VALUE);
-#else
- int i_rv = vpx_memory_tracker_init(0, 0);
-#endif
-
- if (i_rv < 0) {
- _P(printf("ERROR xvpx_malloc MEM_TRACK_USAGE error vpx_memory_tracker_init().\n");)
- }
- }
-
-#if TRY_BOUNDS_CHECK
- {
- int i;
- unsigned int tempme = BOUNDS_CHECK_VALUE;
-
- x_bounds = vpx_memalign(align, size + (BOUNDS_CHECK_PAD_SIZE * 2));
-
- if (x_bounds) {
- /*we're aligning the address twice here but to keep things
- consistent we want to have the padding come before the stored
- address so no matter what free function gets called we will
- attempt to free the correct address*/
- x_bounds = (unsigned char *)(((size_t *)x_bounds)[-1]);
- x = align_addr(x_bounds + BOUNDS_CHECK_PAD_SIZE + ADDRESS_STORAGE_SIZE,
- (int)align);
- /* save the actual malloc address */
- ((size_t *)x)[-1] = (size_t)x_bounds;
-
- for (i = 0; i < BOUNDS_CHECK_PAD_SIZE; i += sizeof(unsigned int)) {
- VPX_MEMCPY_L(x_bounds + i, &tempme, sizeof(unsigned int));
- VPX_MEMCPY_L((unsigned char *)x + size + i,
- &tempme, sizeof(unsigned int));
- }
- } else
- x = NULL;
- }
-#else
- x = vpx_memalign(align, size);
-#endif /*TRY_BOUNDS_CHECK*/
-
- g_alloc_count++;
-
- vpx_memory_tracker_add((size_t)x, (unsigned int)size, file, line, 1);
-
- return x;
-}
-
-void *xvpx_malloc(size_t size, char *file, int line) {
- return xvpx_memalign(DEFAULT_ALIGNMENT, size, file, line);
-}
-
-void *xvpx_calloc(size_t num, size_t size, char *file, int line) {
- void *x = xvpx_memalign(DEFAULT_ALIGNMENT, num * size, file, line);
-
- if (x)
- VPX_MEMSET_L(x, 0, num * size);
-
- return x;
-}
-
-void *xvpx_realloc(void *memblk, size_t size, char *file, int line) {
- struct mem_block *p = NULL;
- int orig_size = 0,
- orig_line = 0;
- char *orig_file = NULL;
-
-#if TRY_BOUNDS_CHECK
- unsigned char *x_bounds = memblk ?
- (unsigned char *)(((size_t *)memblk)[-1]) :
- NULL;
-#endif
-
- void *x;
-
- if (g_alloc_count == 0) {
-#if TRY_BOUNDS_CHECK
-
- if (!vpx_memory_tracker_init(BOUNDS_CHECK_PAD_SIZE, BOUNDS_CHECK_VALUE))
-#else
- if (!vpx_memory_tracker_init(0, 0))
-#endif
- {
- _P(printf("ERROR xvpx_malloc MEM_TRACK_USAGE error vpx_memory_tracker_init().\n");)
- }
- }
-
- if ((p = vpx_memory_tracker_find((size_t)memblk))) {
- orig_size = p->size;
- orig_file = p->file;
- orig_line = p->line;
- }
-
-#if TRY_BOUNDS_CHECK_ON_FREE
- vpx_memory_tracker_check_integrity(file, line);
-#endif
-
- /* have to do this regardless of success, because
- * the memory that does get realloc'd may change
- * the bounds values of this block
- */
- vpx_memory_tracker_remove((size_t)memblk);
-
-#if TRY_BOUNDS_CHECK
- {
- int i;
- unsigned int tempme = BOUNDS_CHECK_VALUE;
-
- x_bounds = vpx_realloc(memblk, size + (BOUNDS_CHECK_PAD_SIZE * 2));
-
- if (x_bounds) {
- x_bounds = (unsigned char *)(((size_t *)x_bounds)[-1]);
- x = align_addr(x_bounds + BOUNDS_CHECK_PAD_SIZE + ADDRESS_STORAGE_SIZE,
- (int)DEFAULT_ALIGNMENT);
- /* save the actual malloc address */
- ((size_t *)x)[-1] = (size_t)x_bounds;
-
- for (i = 0; i < BOUNDS_CHECK_PAD_SIZE; i += sizeof(unsigned int)) {
- VPX_MEMCPY_L(x_bounds + i, &tempme, sizeof(unsigned int));
- VPX_MEMCPY_L((unsigned char *)x + size + i,
- &tempme, sizeof(unsigned int));
- }
- } else
- x = NULL;
+ free(addr);
}
-#else
- x = vpx_realloc(memblk, size);
-#endif /*TRY_BOUNDS_CHECK*/
-
- if (!memblk) ++g_alloc_count;
-
- if (x)
- vpx_memory_tracker_add((size_t)x, (unsigned int)size, file, line, 1);
- else
- vpx_memory_tracker_add((size_t)memblk, orig_size, orig_file, orig_line, 1);
-
- return x;
-}
-
-void xvpx_free(void *p_address, char *file, int line) {
-#if TRY_BOUNDS_CHECK
- unsigned char *p_bounds_address = (unsigned char *)p_address;
- /*p_bounds_address -= BOUNDS_CHECK_PAD_SIZE;*/
-#endif
-
-#if !TRY_BOUNDS_CHECK_ON_FREE
- (void)file;
- (void)line;
-#endif
-
- if (p_address) {
-#if TRY_BOUNDS_CHECK_ON_FREE
- vpx_memory_tracker_check_integrity(file, line);
-#endif
-
- /* if the addr isn't found in the list, assume it was allocated via
- * vpx_ calls not xvpx_, therefore it does not contain any padding
- */
- if (vpx_memory_tracker_remove((size_t)p_address) == -2) {
- p_bounds_address = p_address;
- _P(fprintf(stderr, "[vpx_mem][xvpx_free] addr: %p not found in"
- " list; freed from file:%s"
- " line:%d\n", p_address, file, line));
- } else
- --g_alloc_count;
-
-#if TRY_BOUNDS_CHECK
- vpx_free(p_bounds_address);
-#else
- vpx_free(p_address);
-#endif
-
- if (!g_alloc_count)
- vpx_memory_tracker_destroy();
- }
-}
-
-#endif /*CONFIG_MEM_TRACKER*/
-
-#if CONFIG_MEM_CHECKS
-#if defined(VXWORKS)
-#include <task_lib.h> /*for task_delay()*/
-/* This function is only used to get a stack trace of the player
-object so we can see where we are having a problem. */
-static int get_my_tt(int task) {
- tt(task);
-
- return 0;
-}
-
-static void vx_sleep(int msec) {
- int ticks_to_sleep = 0;
-
- if (msec) {
- int msec_per_tick = 1000 / sys_clk_rate_get();
-
- if (msec < msec_per_tick)
- ticks_to_sleep++;
- else
- ticks_to_sleep = msec / msec_per_tick;
- }
-
- task_delay(ticks_to_sleep);
-}
-#endif
-#endif
-
-void *vpx_memcpy(void *dest, const void *source, size_t length) {
-#if CONFIG_MEM_CHECKS
-
- if (((int)dest < 0x4000) || ((int)source < 0x4000)) {
- _P(printf("WARNING: vpx_memcpy dest:0x%x source:0x%x len:%d\n", (int)dest, (int)source, length);)
-
-#if defined(VXWORKS)
- sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
- vx_sleep(10000);
-#endif
- }
-
-#endif
-
- return VPX_MEMCPY_L(dest, source, length);
-}
-
-void *vpx_memset(void *dest, int val, size_t length) {
-#if CONFIG_MEM_CHECKS
-
- if ((int)dest < 0x4000) {
- _P(printf("WARNING: vpx_memset dest:0x%x val:%d len:%d\n", (int)dest, val, length);)
-
-#if defined(VXWORKS)
- sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
- vx_sleep(10000);
-#endif
- }
-
-#endif
-
- return VPX_MEMSET_L(dest, val, length);
}
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
void *vpx_memset16(void *dest, int val, size_t length) {
-#if CONFIG_MEM_CHECKS
- if ((int)dest < 0x4000) {
- _P(printf("WARNING: vpx_memset dest:0x%x val:%d len:%d\n",
- (int)dest, val, length);)
-
-#if defined(VXWORKS)
- sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
- vx_sleep(10000);
-#endif
- }
-#endif
int i;
void *orig = dest;
uint16_t *dest16 = dest;
@@ -475,207 +101,3 @@ void *vpx_memset16(void *dest, int val, size_t length) {
return orig;
}
#endif // CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
-
-void *vpx_memmove(void *dest, const void *src, size_t count) {
-#if CONFIG_MEM_CHECKS
-
- if (((int)dest < 0x4000) || ((int)src < 0x4000)) {
- _P(printf("WARNING: vpx_memmove dest:0x%x src:0x%x count:%d\n", (int)dest, (int)src, count);)
-
-#if defined(VXWORKS)
- sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
- vx_sleep(10000);
-#endif
- }
-
-#endif
-
- return VPX_MEMMOVE_L(dest, src, count);
-}
-
-#if CONFIG_MEM_MANAGER
-
-static int vpx_mm_create_heap_memory() {
- int i_rv = 0;
-
- if (!g_mng_memory_allocated) {
-#if MM_DYNAMIC_MEMORY
- g_p_mng_memory_raw =
- (unsigned char *)malloc(g_mm_memory_size + HMM_ADDR_ALIGN_UNIT);
-
- if (g_p_mng_memory_raw) {
- g_p_mng_memory = (unsigned char *)((((unsigned int)g_p_mng_memory_raw) +
- HMM_ADDR_ALIGN_UNIT - 1) &
- -(int)HMM_ADDR_ALIGN_UNIT);
-
- _P(printf("[vpx][mm] total memory size:%d g_p_mng_memory_raw:0x%x g_p_mng_memory:0x%x\n"
-, g_mm_memory_size + HMM_ADDR_ALIGN_UNIT
-, (unsigned int)g_p_mng_memory_raw
-, (unsigned int)g_p_mng_memory);)
- } else {
- _P(printf("[vpx][mm] Couldn't allocate memory:%d for vpx memory manager.\n"
-, g_mm_memory_size);)
-
- i_rv = -1;
- }
-
- if (g_p_mng_memory)
-#endif
- {
- int chunk_size = 0;
-
- g_mng_memory_allocated = 1;
-
- hmm_init(&hmm_d);
-
- chunk_size = g_mm_memory_size >> SHIFT_HMM_ADDR_ALIGN_UNIT;
-
- chunk_size -= DUMMY_END_BLOCK_BAUS;
-
- _P(printf("[vpx][mm] memory size:%d for vpx memory manager. g_p_mng_memory:0x%x chunk_size:%d\n"
-, g_mm_memory_size
-, (unsigned int)g_p_mng_memory
-, chunk_size);)
-
- hmm_new_chunk(&hmm_d, (void *)g_p_mng_memory, chunk_size);
- }
-
-#if MM_DYNAMIC_MEMORY
- else {
- _P(printf("[vpx][mm] Couldn't allocate memory:%d for vpx memory manager.\n"
-, g_mm_memory_size);)
-
- i_rv = -1;
- }
-
-#endif
- }
-
- return i_rv;
-}
-
-static void *vpx_mm_realloc(void *memblk, size_t size) {
- void *p_ret = NULL;
-
- if (vpx_mm_create_heap_memory() < 0) {
- _P(printf("[vpx][mm] ERROR vpx_mm_realloc() Couldn't create memory for Heap.\n");)
- } else {
- int i_rv = 0;
- int old_num_aaus;
- int new_num_aaus;
-
- old_num_aaus = hmm_true_size(memblk);
- new_num_aaus = (size >> SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
-
- if (old_num_aaus == new_num_aaus) {
- p_ret = memblk;
- } else {
- i_rv = hmm_resize(&hmm_d, memblk, new_num_aaus);
-
- if (i_rv == 0) {
- p_ret = memblk;
- } else {
- /* Error. Try to malloc and then copy data. */
- void *p_from_malloc;
-
- new_num_aaus = (size >> SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
- p_from_malloc = hmm_alloc(&hmm_d, new_num_aaus);
-
- if (p_from_malloc) {
- vpx_memcpy(p_from_malloc, memblk, size);
- hmm_free(&hmm_d, memblk);
-
- p_ret = p_from_malloc;
- }
- }
- }
- }
-
- return p_ret;
-}
-#endif /*CONFIG_MEM_MANAGER*/
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-# if CONFIG_MEM_TRACKER
-extern int vpx_memory_tracker_set_functions(g_malloc_func g_malloc_l
-, g_calloc_func g_calloc_l
-, g_realloc_func g_realloc_l
-, g_free_func g_free_l
-, g_memcpy_func g_memcpy_l
-, g_memset_func g_memset_l
-, g_memmove_func g_memmove_l);
-# endif
-#endif /*USE_GLOBAL_FUNCTION_POINTERS*/
-int vpx_mem_set_functions(g_malloc_func g_malloc_l
-, g_calloc_func g_calloc_l
-, g_realloc_func g_realloc_l
-, g_free_func g_free_l
-, g_memcpy_func g_memcpy_l
-, g_memset_func g_memset_l
-, g_memmove_func g_memmove_l) {
-#if USE_GLOBAL_FUNCTION_POINTERS
-
-  /* If global function pointers are in use, the
-     application must set them before it does anything
-     else or vpx_mem will have unpredictable results. */
- if (!g_func) {
- g_func = (struct GLOBAL_FUNC_POINTERS *)
- g_malloc_l(sizeof(struct GLOBAL_FUNC_POINTERS));
-
- if (!g_func) {
- return -1;
- }
- }
-
-#if CONFIG_MEM_TRACKER
- {
- int rv = 0;
- rv = vpx_memory_tracker_set_functions(g_malloc_l
-, g_calloc_l
-, g_realloc_l
-, g_free_l
-, g_memcpy_l
-, g_memset_l
-, g_memmove_l);
-
- if (rv < 0) {
- return rv;
- }
- }
-#endif
-
- g_func->g_malloc = g_malloc_l;
- g_func->g_calloc = g_calloc_l;
- g_func->g_realloc = g_realloc_l;
- g_func->g_free = g_free_l;
- g_func->g_memcpy = g_memcpy_l;
- g_func->g_memset = g_memset_l;
- g_func->g_memmove = g_memmove_l;
-
- return 0;
-#else
- (void)g_malloc_l;
- (void)g_calloc_l;
- (void)g_realloc_l;
- (void)g_free_l;
- (void)g_memcpy_l;
- (void)g_memset_l;
- (void)g_memmove_l;
- return -1;
-#endif
-}
-
-int vpx_mem_unset_functions() {
-#if USE_GLOBAL_FUNCTION_POINTERS
-
- if (g_func) {
- g_free_func temp_free = g_func->g_free;
- temp_free(g_func);
- g_func = NULL;
- }
-
-#endif
- return 0;
-}
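
Both before and after this cleanup, vpx_memalign over-allocates and stores the
raw malloc address one slot below the pointer it returns, which is what
vpx_free's ((size_t *)memblk)[-1] recovers. A minimal sketch of that scheme,
assuming align is a power of two and the storage slot is sizeof(size_t) bytes
(the sketch_* names are illustrative, not the library's):

    #include <stdlib.h>
    #include <stdint.h>

    static void *sketch_memalign(size_t align, size_t size) {
      /* Over-allocate: the storage slot plus worst-case alignment slack. */
      unsigned char *raw = malloc(size + align - 1 + sizeof(size_t));
      uintptr_t aligned;
      if (!raw) return NULL;
      /* Round the address past the slot up to the next multiple of align. */
      aligned = ((uintptr_t)(raw + sizeof(size_t)) + align - 1) &
                ~(uintptr_t)(align - 1);
      /* Stash the raw malloc address just below the returned pointer. */
      ((size_t *)aligned)[-1] = (size_t)raw;
      return (void *)aligned;
    }

    static void sketch_free(void *p) {
      if (p) free((void *)((size_t *)p)[-1]); /* recover the raw address */
    }
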
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h
index e2391f49629..a027714a01b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h
@@ -17,27 +17,6 @@
# include <lddk.h>
#endif
-/* vpx_mem version info */
-#define vpx_mem_version "2.2.1.5"
-
-#define VPX_MEM_VERSION_CHIEF 2
-#define VPX_MEM_VERSION_MAJOR 2
-#define VPX_MEM_VERSION_MINOR 1
-#define VPX_MEM_VERSION_PATCH 5
-/* end - vpx_mem version info */
-
-#ifndef VPX_TRACK_MEM_USAGE
-# define VPX_TRACK_MEM_USAGE 0 /* enable memory tracking/integrity checks */
-#endif
-#ifndef VPX_CHECK_MEM_FUNCTIONS
-# define VPX_CHECK_MEM_FUNCTIONS 0 /* enable basic safety checks in _memcpy,
-_memset, and _memmove */
-#endif
-#ifndef REPLACE_BUILTIN_FUNCTIONS
-# define REPLACE_BUILTIN_FUNCTIONS 0 /* replace builtin functions with their
-vpx_ equivalents */
-#endif
-
#include <stdlib.h>
#include <stddef.h>
@@ -45,125 +24,17 @@ vpx_ equivalents */
extern "C" {
#endif
- /*
- vpx_mem_get_version()
-    Provided for runtime version checking. Returns an unsigned int of the form
-    CHIEF | MAJOR | MINOR | PATCH, where the chief version number is the
-    high-order byte.
- */
- unsigned int vpx_mem_get_version(void);
-
- /*
- vpx_mem_set_heap_size(size_t size)
- size - size in bytes for the memory manager to allocate for its heap
- Sets the memory manager's initial heap size
- Return:
- 0: on success
- -1: if memory manager calls have not been included in the vpx_mem lib
- -2: if the memory manager has been compiled to use static memory
- -3: if the memory manager has already allocated its heap
- */
- int vpx_mem_set_heap_size(size_t size);
-
void *vpx_memalign(size_t align, size_t size);
void *vpx_malloc(size_t size);
void *vpx_calloc(size_t num, size_t size);
void *vpx_realloc(void *memblk, size_t size);
void vpx_free(void *memblk);
- void *vpx_memcpy(void *dest, const void *src, size_t length);
- void *vpx_memset(void *dest, int val, size_t length);
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
void *vpx_memset16(void *dest, int val, size_t length);
#endif
- void *vpx_memmove(void *dest, const void *src, size_t count);
-
- /* special memory functions */
- void *vpx_mem_alloc(int id, size_t size, size_t align);
- void vpx_mem_free(int id, void *mem, size_t size);
-
- /* Wrappers to standard library functions. */
- typedef void *(* g_malloc_func)(size_t);
- typedef void *(* g_calloc_func)(size_t, size_t);
- typedef void *(* g_realloc_func)(void *, size_t);
- typedef void (* g_free_func)(void *);
- typedef void *(* g_memcpy_func)(void *, const void *, size_t);
- typedef void *(* g_memset_func)(void *, int, size_t);
- typedef void *(* g_memmove_func)(void *, const void *, size_t);
-
- int vpx_mem_set_functions(g_malloc_func g_malloc_l
-, g_calloc_func g_calloc_l
-, g_realloc_func g_realloc_l
-, g_free_func g_free_l
-, g_memcpy_func g_memcpy_l
-, g_memset_func g_memset_l
-, g_memmove_func g_memmove_l);
- int vpx_mem_unset_functions(void);
-
- /* some defines for backward compatibility */
-#define DMEM_GENERAL 0
-
-// (*)<
-
-#if REPLACE_BUILTIN_FUNCTIONS
-# ifndef __VPX_MEM_C__
-# define memalign vpx_memalign
-# define malloc vpx_malloc
-# define calloc vpx_calloc
-# define realloc vpx_realloc
-# define free vpx_free
-# define memcpy vpx_memcpy
-# define memmove vpx_memmove
-# define memset vpx_memset
-# endif
-#endif
-
-#if CONFIG_MEM_TRACKER
-#include <stdarg.h>
- /*from vpx_mem/vpx_mem_tracker.c*/
- extern void vpx_memory_tracker_dump();
- extern void vpx_memory_tracker_check_integrity(char *file, unsigned int line);
- extern int vpx_memory_tracker_set_log_type(int type, char *option);
- extern int vpx_memory_tracker_set_log_func(void *userdata,
- void(*logfunc)(void *userdata,
- const char *fmt, va_list args));
-# ifndef __VPX_MEM_C__
-# define vpx_memalign(align, size) xvpx_memalign((align), (size), __FILE__, __LINE__)
-# define vpx_malloc(size) xvpx_malloc((size), __FILE__, __LINE__)
-# define vpx_calloc(num, size) xvpx_calloc(num, size, __FILE__, __LINE__)
-# define vpx_realloc(addr, size) xvpx_realloc(addr, size, __FILE__, __LINE__)
-# define vpx_free(addr) xvpx_free(addr, __FILE__, __LINE__)
-# define vpx_memory_tracker_check_integrity() vpx_memory_tracker_check_integrity(__FILE__, __LINE__)
-# define vpx_mem_alloc(id,size,align) xvpx_mem_alloc(id, size, align, __FILE__, __LINE__)
-# define vpx_mem_free(id,mem,size) xvpx_mem_free(id, mem, size, __FILE__, __LINE__)
-# endif
-
- void *xvpx_memalign(size_t align, size_t size, char *file, int line);
- void *xvpx_malloc(size_t size, char *file, int line);
- void *xvpx_calloc(size_t num, size_t size, char *file, int line);
- void *xvpx_realloc(void *memblk, size_t size, char *file, int line);
- void xvpx_free(void *memblk, char *file, int line);
- void *xvpx_mem_alloc(int id, size_t size, size_t align, char *file, int line);
- void xvpx_mem_free(int id, void *mem, size_t size, char *file, int line);
-
-#else
-# ifndef __VPX_MEM_C__
-# define vpx_memory_tracker_dump()
-# define vpx_memory_tracker_check_integrity()
-# define vpx_memory_tracker_set_log_type(t,o) 0
-# define vpx_memory_tracker_set_log_func(u,f) 0
-# endif
-#endif
-
-#if !VPX_CHECK_MEM_FUNCTIONS
-# ifndef __VPX_MEM_C__
-# include <string.h>
-# define vpx_memcpy memcpy
-# define vpx_memset memset
-# define vpx_memmove memmove
-# endif
-#endif
+#include <string.h>
#ifdef VPX_MEM_PLTFRM
# include VPX_MEM_PLTFRM
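
The CONFIG_MEM_TRACKER block removed above worked by redefining each vpx_*
allocator as an xvpx_* variant that captures __FILE__ and __LINE__ at the call
site. A stripped-down sketch of that macro pattern, using illustrative
my_*/xmy_* names rather than the library's:

    #include <stdio.h>
    #include <stdlib.h>

    static void *xmy_malloc(size_t size, const char *file, int line) {
      void *p = malloc(size);
      /* A real tracker would record this in a list; logging stands in here. */
      fprintf(stderr, "alloc %zu bytes at %p (%s:%d)\n", size, p, file, line);
      return p;
    }

    /* Call sites keep writing my_malloc(n); the macro records where each
     * allocation happened. */
    #define my_malloc(size) xmy_malloc((size), __FILE__, __LINE__)
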
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.mk b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.mk
index 4663c5a91f0..7f275eabf92 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.mk
@@ -2,21 +2,3 @@ MEM_SRCS-yes += vpx_mem.mk
MEM_SRCS-yes += vpx_mem.c
MEM_SRCS-yes += vpx_mem.h
MEM_SRCS-yes += include/vpx_mem_intrnl.h
-
-MEM_SRCS-$(CONFIG_MEM_TRACKER) += vpx_mem_tracker.c
-MEM_SRCS-$(CONFIG_MEM_TRACKER) += include/vpx_mem_tracker.h
-
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_true.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_resize.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_shrink.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_largest.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_dflt_abort.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_base.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/hmm_intrnl.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/cavl_if.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/hmm_cnfg.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/heapmm.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/cavl_impl.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_grow.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_alloc.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem_tracker.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem_tracker.c
deleted file mode 100644
index 613e8a16b0e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem_tracker.c
+++ /dev/null
@@ -1,740 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/*
- vpx_mem_tracker.c
-
- jwz 2003-09-30:
-  Stores a list of addresses, their size, and the file and line they came from.
-  All exposed lib functions are prefixed with vpx_ and keep the global list
-  thread safe.
- Current supported platforms are:
- Linux, Win32, win_ce and vx_works
- Further support can be added by defining the platform specific mutex
- in the memory_tracker struct as well as calls to create/destroy/lock/unlock
- the mutex in vpx_memory_tracker_init/Destroy and memory_tracker_lock_mutex/unlock_mutex
-*/
-#include "./vpx_config.h"
-
-#if defined(__uClinux__)
-# include <lddk.h>
-#endif
-
-#if HAVE_PTHREAD_H
-# include <pthread.h>
-#elif defined(WIN32) || defined(_WIN32_WCE)
-# define WIN32_LEAN_AND_MEAN
-# include <windows.h>
-# include <winbase.h>
-#elif defined(VXWORKS)
-# include <sem_lib.h>
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>  // VXWORKS doesn't have a malloc/memory.h file;
-                     // this should pull in malloc, free, etc.
-#include <stdarg.h>
-
-#include "include/vpx_mem_tracker.h"
-
-#undef vpx_malloc // undefine any vpx_mem macros that may affect calls to
-#undef vpx_free // memory functions in this file
-#undef vpx_memcpy
-#undef vpx_memset
-
-
-#ifndef USE_GLOBAL_FUNCTION_POINTERS
-# define USE_GLOBAL_FUNCTION_POINTERS 0 // use function pointers instead of compiled functions.
-#endif
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-static mem_track_malloc_func g_malloc = malloc;
-static mem_track_calloc_func g_calloc = calloc;
-static mem_track_realloc_func g_realloc = realloc;
-static mem_track_free_func g_free = free;
-static mem_track_memcpy_func g_memcpy = memcpy;
-static mem_track_memset_func g_memset = memset;
-static mem_track_memmove_func g_memmove = memmove;
-# define MEM_TRACK_MALLOC g_malloc
-# define MEM_TRACK_FREE g_free
-# define MEM_TRACK_MEMCPY g_memcpy
-# define MEM_TRACK_MEMSET g_memset
-#else
-# define MEM_TRACK_MALLOC vpx_malloc
-# define MEM_TRACK_FREE vpx_free
-# define MEM_TRACK_MEMCPY vpx_memcpy
-# define MEM_TRACK_MEMSET vpx_memset
-#endif // USE_GLOBAL_FUNCTION_POINTERS
-
-/* prototypes for internal library functions */
-static void memtrack_log(const char *fmt, ...);
-static void memory_tracker_dump();
-static void memory_tracker_check_integrity(char *file, unsigned int line);
-static void memory_tracker_add(size_t addr, unsigned int size,
- char *file, unsigned int line,
- int padded);
-static int memory_tracker_remove(size_t addr);
-static struct mem_block *memory_tracker_find(size_t addr);
-
-#if defined(NO_MUTEX)
-# define memory_tracker_lock_mutex() (!g_b_mem_tracker_inited)
-# define memory_tracker_unlock_mutex()
-#else
-static int memory_tracker_lock_mutex();
-static int memory_tracker_unlock_mutex();
-#endif
-
-#ifndef VPX_NO_GLOBALS
-struct memory_tracker {
- struct mem_block *head,
- * tail;
- int len,
- totalsize;
- unsigned int current_allocated,
- max_allocated;
-
-#if HAVE_PTHREAD_H
- pthread_mutex_t mutex;
-#elif defined(WIN32) || defined(_WIN32_WCE)
- HANDLE mutex;
-#elif defined(VXWORKS)
- SEM_ID mutex;
-#elif defined(NO_MUTEX)
-#else
-#error "No mutex type defined for this platform!"
-#endif
-
- int padding_size,
- pad_value;
-};
-
-static struct memory_tracker memtrack; // our global memory allocation list
-static int g_b_mem_tracker_inited = 0; // indicates whether the global list has
-// been initialized (1:yes/0:no)
-static struct {
- FILE *file;
- int type;
- void (*func)(void *userdata, const char *fmt, va_list args);
- void *userdata;
-} g_logging = {NULL, 0, NULL, NULL};
-#else
-# include "vpx_global_handling.h"
-#define g_b_mem_tracker_inited vpxglobalm(vpxmem,g_b_mem_tracker_inited)
-#define g_logging vpxglobalm(vpxmem,g_logging)
-#define memtrack vpxglobalm(vpxmem,memtrack)
-#endif // #ifndef VPX_NO_GLOBALS
-
-extern void *vpx_malloc(size_t size);
-extern void vpx_free(void *memblk);
-extern void *vpx_memcpy(void *dest, const void *src, size_t length);
-extern void *vpx_memset(void *dest, int val, size_t length);
-
-/*
- *
- * Exposed library functions
- *
-*/
-
-/*
- vpx_memory_tracker_init(int padding_size, int pad_value)
- padding_size - the size of the padding before and after each mem addr.
- Values > 0 indicate that integrity checks can be performed
- by inspecting these areas.
- pad_value - the initial value within the padding area before and after
- each mem addr.
-
- Initializes global memory tracker structure
- Allocates the head of the list
-*/
-int vpx_memory_tracker_init(int padding_size, int pad_value) {
- if (!g_b_mem_tracker_inited) {
- if ((memtrack.head = (struct mem_block *)
- MEM_TRACK_MALLOC(sizeof(struct mem_block)))) {
- int ret;
-
- MEM_TRACK_MEMSET(memtrack.head, 0, sizeof(struct mem_block));
-
- memtrack.tail = memtrack.head;
-
- memtrack.current_allocated = 0;
- memtrack.max_allocated = 0;
-
- memtrack.padding_size = padding_size;
- memtrack.pad_value = pad_value;
-
-#if HAVE_PTHREAD_H
- ret = pthread_mutex_init(&memtrack.mutex,
- NULL); /*mutex attributes (NULL=default)*/
-#elif defined(WIN32) || defined(_WIN32_WCE)
- memtrack.mutex = CreateMutex(NULL, /*security attributes*/
- FALSE, /*we don't want initial ownership*/
- NULL); /*mutex name*/
- ret = !memtrack.mutex;
-#elif defined(VXWORKS)
- memtrack.mutex = sem_bcreate(SEM_Q_FIFO, /*SEM_Q_FIFO non-priority based mutex*/
- SEM_FULL); /*SEM_FULL initial state is unlocked*/
- ret = !memtrack.mutex;
-#elif defined(NO_MUTEX)
- ret = 0;
-#endif
-
- if (ret) {
- memtrack_log("vpx_memory_tracker_init: Error creating mutex!\n");
-
- MEM_TRACK_FREE(memtrack.head);
- memtrack.head = NULL;
- } else {
- memtrack_log("Memory Tracker init'd, v."vpx_mem_tracker_version" pad_size:%d pad_val:0x%x %d\n"
-, padding_size
-, pad_value
-, pad_value);
- g_b_mem_tracker_inited = 1;
- }
- }
- }
-
- return g_b_mem_tracker_inited;
-}
-
-/*
- vpx_memory_tracker_destroy()
-  If our global struct was initialized, zeros out all its members,
-  frees memory and destroys its mutex
-*/
-void vpx_memory_tracker_destroy() {
- if (!memory_tracker_lock_mutex()) {
- struct mem_block *p = memtrack.head,
- * p2 = memtrack.head;
-
- memory_tracker_dump();
-
- while (p) {
- p2 = p;
- p = p->next;
-
- MEM_TRACK_FREE(p2);
- }
-
- memtrack.head = NULL;
- memtrack.tail = NULL;
- memtrack.len = 0;
- memtrack.current_allocated = 0;
- memtrack.max_allocated = 0;
-
- if (!g_logging.type && g_logging.file && g_logging.file != stderr) {
- fclose(g_logging.file);
- g_logging.file = NULL;
- }
-
- memory_tracker_unlock_mutex();
-
- g_b_mem_tracker_inited = 0;
- }
-}
-
-/*
- vpx_memory_tracker_add(size_t addr, unsigned int size,
- char * file, unsigned int line)
- addr - memory address to be added to list
- size - size of addr
- file - the file addr was referenced from
- line - the line in file addr was referenced from
-  Adds memory address addr, its size, and the file and line it came from
-  to the global list via the thread-safe internal library function
-*/
-void vpx_memory_tracker_add(size_t addr, unsigned int size,
- char *file, unsigned int line,
- int padded) {
- memory_tracker_add(addr, size, file, line, padded);
-}
-
-/*
- vpx_memory_tracker_remove(size_t addr)
- addr - memory address to be removed from list
-  Removes addr from the global list via the thread-safe
-  internal remove function
- Return:
- Same as described for memory_tracker_remove
-*/
-int vpx_memory_tracker_remove(size_t addr) {
- return memory_tracker_remove(addr);
-}
-
-/*
- vpx_memory_tracker_find(size_t addr)
- addr - address to be found in list
- Return:
- If found, pointer to the memory block that matches addr
- NULL otherwise
-*/
-struct mem_block *vpx_memory_tracker_find(size_t addr) {
- struct mem_block *p = NULL;
-
- if (!memory_tracker_lock_mutex()) {
- p = memory_tracker_find(addr);
- memory_tracker_unlock_mutex();
- }
-
- return p;
-}
-
-/*
- vpx_memory_tracker_dump()
- Locks the memory tracker's mutex and calls the internal
- library function to dump the current contents of the
- global memory allocation list
-*/
-void vpx_memory_tracker_dump() {
- if (!memory_tracker_lock_mutex()) {
- memory_tracker_dump();
- memory_tracker_unlock_mutex();
- }
-}
-
-/*
- vpx_memory_tracker_check_integrity(char* file, unsigned int line)
- file - The file name where the check was placed
- line - The line in file where the check was placed
- Locks the memory tracker's mutex and calls the internal
- integrity check function to inspect every address in the global
- memory allocation list
-*/
-void vpx_memory_tracker_check_integrity(char *file, unsigned int line) {
- if (!memory_tracker_lock_mutex()) {
- memory_tracker_check_integrity(file, line);
- memory_tracker_unlock_mutex();
- }
-}
-
-/*
- vpx_memory_tracker_set_log_type
- Sets the logging type for the memory tracker. Based on the value it will
- direct its output to the appropriate place.
- Return:
- 0: on success
- -1: if the logging type could not be set, because the value was invalid
- or because a file could not be opened
-*/
-int vpx_memory_tracker_set_log_type(int type, char *option) {
- int ret = -1;
-
- switch (type) {
- case 0:
- g_logging.type = 0;
-
- if (!option) {
- g_logging.file = stderr;
- ret = 0;
- } else {
- if ((g_logging.file = fopen((char *)option, "w")))
- ret = 0;
- }
-
- break;
-#if defined(WIN32) && !defined(_WIN32_WCE)
- case 1:
- g_logging.type = type;
- ret = 0;
- break;
-#endif
- default:
- break;
- }
-
- // output the version to the new logging destination
- if (!ret)
- memtrack_log("Memory Tracker logging initialized, "
- "Memory Tracker v."vpx_mem_tracker_version"\n");
-
- return ret;
-}
-
-/*
- vpx_memory_tracker_set_log_func
- Sets a logging function to be used by the memory tracker.
- Return:
- 0: on success
- -1: if the logging type could not be set because logfunc was NULL
-*/
-int vpx_memory_tracker_set_log_func(void *userdata,
- void(*logfunc)(void *userdata,
- const char *fmt, va_list args)) {
- int ret = -1;
-
- if (logfunc) {
- g_logging.type = -1;
- g_logging.userdata = userdata;
- g_logging.func = logfunc;
- ret = 0;
- }
-
- // output the version to the new logging destination
- if (!ret)
- memtrack_log("Memory Tracker logging initialized, "
- "Memory Tracker v."vpx_mem_tracker_version"\n");
-
- return ret;
-}
-
-/*
- *
- * END - Exposed library functions
- *
-*/
-
-
-/*
- *
- * Internal library functions
- *
-*/
-
-static void memtrack_log(const char *fmt, ...) {
- va_list list;
-
- va_start(list, fmt);
-
- switch (g_logging.type) {
- case -1:
-
- if (g_logging.func)
- g_logging.func(g_logging.userdata, fmt, list);
-
- break;
- case 0:
-
- if (g_logging.file) {
- vfprintf(g_logging.file, fmt, list);
- fflush(g_logging.file);
- }
-
- break;
-#if defined(WIN32) && !defined(_WIN32_WCE)
- case 1: {
- char temp[1024];
- _vsnprintf(temp, sizeof(temp) / sizeof(char) - 1, fmt, list);
- OutputDebugString(temp);
- }
- break;
-#endif
- default:
- break;
- }
-
- va_end(list);
-}
-
-/*
- memory_tracker_dump()
- Dumps the current contents of the global memory allocation list
-*/
-static void memory_tracker_dump() {
- int i = 0;
- struct mem_block *p = (memtrack.head ? memtrack.head->next : NULL);
-
- memtrack_log("\n_currently Allocated= %d; Max allocated= %d\n",
- memtrack.current_allocated, memtrack.max_allocated);
-
- while (p) {
-#if defined(WIN32) && !defined(_WIN32_WCE)
-
-    /*when using OutputDebugString, output filenames so they
-      can be clicked to open in Visual Studio*/
- if (g_logging.type == 1)
- memtrack_log("memblocks[%d].addr= 0x%.8x, memblocks[%d].size= %d, file:\n"
- " %s(%d):\n", i,
- p->addr, i, p->size,
- p->file, p->line);
- else
-#endif
- memtrack_log("memblocks[%d].addr= 0x%.8x, memblocks[%d].size= %d, file: %s, line: %d\n", i,
- p->addr, i, p->size,
- p->file, p->line);
-
- p = p->next;
- ++i;
- }
-
- memtrack_log("\n");
-}
-
-/*
-  memory_tracker_check_integrity(char* file, unsigned int line)
-  file - the file name where the check was placed
-  line - the line in file where the check was placed
-  If a padding_size was supplied to vpx_memory_tracker_init()
-  this function will check each addr in the list, verifying that
-  addr-padding_size and addr+padding_size are filled with pad_value
-*/
-static void memory_tracker_check_integrity(char *file, unsigned int line) {
- if (memtrack.padding_size) {
- int i,
- index = 0;
- unsigned char *p_show_me,
- * p_show_me2;
- unsigned int tempme = memtrack.pad_value,
- dead1,
- dead2;
- unsigned char *x_bounds;
- struct mem_block *p = memtrack.head->next;
-
- while (p) {
- // x_bounds = (unsigned char*)p->addr;
- // back up VPX_BYTE_ALIGNMENT
- // x_bounds -= memtrack.padding_size;
-
- if (p->padded) { // can the bounds be checked?
- /*yes, move to the address that was actually allocated
- by the vpx_* calls*/
- x_bounds = (unsigned char *)(((size_t *)p->addr)[-1]);
-
- for (i = 0; i < memtrack.padding_size; i += sizeof(unsigned int)) {
- p_show_me = (x_bounds + i);
- p_show_me2 = (unsigned char *)(p->addr + p->size + i);
-
- MEM_TRACK_MEMCPY(&dead1, p_show_me, sizeof(unsigned int));
- MEM_TRACK_MEMCPY(&dead2, p_show_me2, sizeof(unsigned int));
-
- if ((dead1 != tempme) || (dead2 != tempme)) {
- memtrack_log("\n[vpx_mem integrity check failed]:\n"
- " index[%d,%d] {%s:%d} addr=0x%x, size=%d,"
- " file: %s, line: %d c0:0x%x c1:0x%x\n",
- index, i, file, line, p->addr, p->size, p->file,
- p->line, dead1, dead2);
- }
- }
- }
-
- ++index;
- p = p->next;
- }
- }
-}
-
-/*
- memory_tracker_add(size_t addr, unsigned int size,
- char * file, unsigned int line)
-  Adds an address (addr), its size, and file and line number to our list.
- Adjusts the total bytes allocated and max bytes allocated if necessary.
- If memory cannot be allocated the list will be destroyed.
-*/
-void memory_tracker_add(size_t addr, unsigned int size,
- char *file, unsigned int line,
- int padded) {
- if (!memory_tracker_lock_mutex()) {
- struct mem_block *p;
-
- p = MEM_TRACK_MALLOC(sizeof(struct mem_block));
-
- if (p) {
- p->prev = memtrack.tail;
- p->prev->next = p;
- p->addr = addr;
- p->size = size;
- p->line = line;
- p->file = file;
- p->padded = padded;
- p->next = NULL;
-
- memtrack.tail = p;
-
- memtrack.current_allocated += size;
-
- if (memtrack.current_allocated > memtrack.max_allocated)
- memtrack.max_allocated = memtrack.current_allocated;
-
- // memtrack_log("memory_tracker_add: added addr=0x%.8x\n", addr);
-
- memory_tracker_unlock_mutex();
- } else {
- memtrack_log("memory_tracker_add: error allocating memory!\n");
- memory_tracker_unlock_mutex();
- vpx_memory_tracker_destroy();
- }
- }
-}
-
-/*
- memory_tracker_remove(size_t addr)
- Removes an address and its corresponding size (if they exist)
- from the memory tracker list and adjusts the current number
- of bytes allocated.
- Return:
- 0: on success
- -1: if the mutex could not be locked
- -2: if the addr was not found in the list
-*/
-int memory_tracker_remove(size_t addr) {
- int ret = -1;
-
- if (!memory_tracker_lock_mutex()) {
- struct mem_block *p;
-
- if ((p = memory_tracker_find(addr))) {
- memtrack.current_allocated -= p->size;
-
- p->prev->next = p->next;
-
- if (p->next)
- p->next->prev = p->prev;
- else
- memtrack.tail = p->prev;
-
- ret = 0;
- MEM_TRACK_FREE(p);
- } else {
- if (addr)
- memtrack_log("memory_tracker_remove(): addr not found in list,"
- " 0x%.8x\n", addr);
-
- ret = -2;
- }
-
- memory_tracker_unlock_mutex();
- }
-
- return ret;
-}
-
-/*
- memory_tracker_find(size_t addr)
- Finds an address in our addrs list
- NOTE: the mutex MUST be locked in the other internal
- functions before calling this one. This avoids
- the need for repeated locking and unlocking as in Remove
- Returns: pointer to the mem block if found, NULL otherwise
-*/
-static struct mem_block *memory_tracker_find(size_t addr) {
- struct mem_block *p = NULL;
-
- if (memtrack.head) {
- p = memtrack.head->next;
-
- while (p && (p->addr != addr))
- p = p->next;
- }
-
- return p;
-}
-
-
-#if !defined(NO_MUTEX)
-/*
- memory_tracker_lock_mutex()
- Locks the memory tracker mutex with a platform specific call
- Returns:
- 0: Success
- <0: Failure, either the mutex was not initialized
- or the call to lock the mutex failed
-*/
-static int memory_tracker_lock_mutex() {
- int ret = -1;
-
- if (g_b_mem_tracker_inited) {
-
-#if HAVE_PTHREAD_H
- ret = pthread_mutex_lock(&memtrack.mutex);
-#elif defined(WIN32) || defined(_WIN32_WCE)
- ret = WaitForSingleObject(memtrack.mutex, INFINITE);
-#elif defined(VXWORKS)
- ret = sem_take(memtrack.mutex, WAIT_FOREVER);
-#endif
-
- if (ret) {
- memtrack_log("memory_tracker_lock_mutex: mutex lock failed\n");
- }
- }
-
- return ret;
-}
-
-/*
- memory_tracker_unlock_mutex()
- Unlocks the memory tracker mutex with a platform specific call
- Returns:
- 0: Success
- <0: Failure, either the mutex was not initialized
- or the call to unlock the mutex failed
-*/
-static int memory_tracker_unlock_mutex() {
- int ret = -1;
-
- if (g_b_mem_tracker_inited) {
-
-#if HAVE_PTHREAD_H
- ret = pthread_mutex_unlock(&memtrack.mutex);
-#elif defined(WIN32) || defined(_WIN32_WCE)
- ret = !ReleaseMutex(memtrack.mutex);
-#elif defined(VXWORKS)
- ret = sem_give(memtrack.mutex);
-#endif
-
- if (ret) {
- memtrack_log("memory_tracker_unlock_mutex: mutex unlock failed\n");
- }
- }
-
- return ret;
-}
-#endif
-
-/*
- vpx_memory_tracker_set_functions
-
- Sets the function pointers for the standard library functions.
-
- Return:
- 0: on success
-    -1: if USE_GLOBAL_FUNCTION_POINTERS is not enabled.
-*/
-int vpx_memory_tracker_set_functions(mem_track_malloc_func g_malloc_l
-, mem_track_calloc_func g_calloc_l
-, mem_track_realloc_func g_realloc_l
-, mem_track_free_func g_free_l
-, mem_track_memcpy_func g_memcpy_l
-, mem_track_memset_func g_memset_l
-, mem_track_memmove_func g_memmove_l) {
-#if USE_GLOBAL_FUNCTION_POINTERS
-
- if (g_malloc_l)
- g_malloc = g_malloc_l;
-
- if (g_calloc_l)
- g_calloc = g_calloc_l;
-
- if (g_realloc_l)
- g_realloc = g_realloc_l;
-
- if (g_free_l)
- g_free = g_free_l;
-
- if (g_memcpy_l)
- g_memcpy = g_memcpy_l;
-
- if (g_memset_l)
- g_memset = g_memset_l;
-
- if (g_memmove_l)
- g_memmove = g_memmove_l;
-
- return 0;
-#else
- (void)g_malloc_l;
- (void)g_calloc_l;
- (void)g_realloc_l;
- (void)g_free_l;
- (void)g_memcpy_l;
- (void)g_memset_l;
- (void)g_memmove_l;
- return -1;
-#endif
-}
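
The deleted tracker's integrity check reduces to a simple idea: surround each
allocation with pads filled with a sentinel byte, then verify the pads later
to catch out-of-bounds writes. A self-contained sketch, with illustrative PAD
and SENTINEL values:

    #include <stdlib.h>
    #include <string.h>

    #define PAD 32
    #define SENTINEL 0xDE

    static void *padded_alloc(size_t size) {
      unsigned char *raw = malloc(size + 2 * PAD);
      if (!raw) return NULL;
      memset(raw, SENTINEL, PAD);              /* leading pad */
      memset(raw + PAD + size, SENTINEL, PAD); /* trailing pad */
      return raw + PAD;
    }

    /* Returns 0 if both pads are intact, -1 if either was overwritten. */
    static int padded_check(const void *p, size_t size) {
      const unsigned char *raw = (const unsigned char *)p - PAD;
      size_t i;
      for (i = 0; i < PAD; ++i)
        if (raw[i] != SENTINEL || raw[PAD + size + i] != SENTINEL)
          return -1;
      return 0;
    }
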
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c b/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c
index f03feffbc24..8a4b8af964f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c
@@ -49,9 +49,6 @@ int arm_cpu_caps(void) {
return flags;
}
mask = arm_cpu_env_mask();
-#if HAVE_EDSP
- flags |= HAS_EDSP;
-#endif /* HAVE_EDSP */
#if HAVE_MEDIA
flags |= HAS_MEDIA;
#endif /* HAVE_MEDIA */
@@ -78,17 +75,6 @@ int arm_cpu_caps(void) {
* instructions via their assembled hex code.
* All of these instructions should be essentially nops.
*/
-#if HAVE_EDSP
- if (mask & HAS_EDSP) {
- __try {
- /*PLD [r13]*/
- __emit(0xF5DDF000);
- flags |= HAS_EDSP;
- } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
- /*Ignore exception.*/
- }
- }
-#endif /* HAVE_EDSP */
#if HAVE_MEDIA
if (mask & HAS_MEDIA)
__try {
@@ -127,9 +113,6 @@ int arm_cpu_caps(void) {
mask = arm_cpu_env_mask();
features = android_getCpuFeatures();
-#if HAVE_EDSP
- flags |= HAS_EDSP;
-#endif /* HAVE_EDSP */
#if HAVE_MEDIA
flags |= HAS_MEDIA;
#endif /* HAVE_MEDIA */
@@ -163,23 +146,15 @@ int arm_cpu_caps(void) {
*/
char buf[512];
while (fgets(buf, 511, fin) != NULL) {
-#if HAVE_EDSP || HAVE_NEON || HAVE_NEON_ASM
+#if HAVE_NEON || HAVE_NEON_ASM
if (memcmp(buf, "Features", 8) == 0) {
char *p;
-#if HAVE_EDSP
- p = strstr(buf, " edsp");
- if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
- flags |= HAS_EDSP;
- }
-#endif /* HAVE_EDSP */
-#if HAVE_NEON || HAVE_NEON_ASM
p = strstr(buf, " neon");
if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
flags |= HAS_NEON;
}
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
}
-#endif /* HAVE_EDSP || HAVE_NEON || HAVE_NEON_ASM */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
#if HAVE_MEDIA
if (memcmp(buf, "CPU architecture:", 17) == 0) {
int version;
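
The Linux path above parses /proc/cpuinfo and looks for a whole-word flag on
the "Features" line; the delimiter check is what keeps "neon" from matching
inside a longer token. A generic sketch of the same probe (cpuinfo_has_flag is
an illustrative helper, not part of libvpx):

    #include <stdio.h>
    #include <string.h>

    static int cpuinfo_has_flag(const char *flag) {
      char buf[512], pat[64];
      int found = 0;
      FILE *fin = fopen("/proc/cpuinfo", "r");
      if (!fin) return 0;
      snprintf(pat, sizeof(pat), " %s", flag); /* e.g. " neon" */
      while (fgets(buf, sizeof(buf), fin) != NULL) {
        if (memcmp(buf, "Features", 8) == 0) {
          const char *p = strstr(buf, pat);
          /* Require a delimiter after the match so "neon" != "neonx". */
          if (p != NULL &&
              (p[strlen(pat)] == ' ' || p[strlen(pat)] == '\n'))
            found = 1;
        }
      }
      fclose(fin);
      return found;
    }
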
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/asm_offsets.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/asm_offsets.h
deleted file mode 100644
index 317bbedcb13..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/asm_offsets.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VPX_PORTS_ASM_OFFSETS_H_
-#define VPX_PORTS_ASM_OFFSETS_H_
-
-#include <stddef.h>
-
-#define ct_assert(name,cond) \
- static void assert_##name(void) UNUSED;\
- static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
-
-#if INLINE_ASM
-#define DEFINE(sym, val) asm("\n" #sym " EQU %0" : : "i" (val))
-#define BEGIN int main(void) {
-#define END return 0; }
-#else
-#define DEFINE(sym, val) const int sym = val
-#define BEGIN
-#define END
-#endif
-
-#endif // VPX_PORTS_ASM_OFFSETS_H_
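
The removed ct_assert macro turns a false condition into a compile error: a
switch statement may not repeat a case label, so case !!(cond) collides with
case 0 exactly when cond is false. A usage sketch, assuming a GCC-style UNUSED
attribute since the header relied on one defined elsewhere:

    #define UNUSED __attribute__((unused)) /* assumption: GCC/Clang */
    #define ct_assert(name, cond) \
      static void assert_##name(void) UNUSED; \
      static void assert_##name(void) { switch (0) { case 0: case !!(cond):; } }

    ct_assert(int_is_4_bytes, sizeof(int) == 4) /* compiles only if true */
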
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h
index 1cb8c8cd9af..0106a45d6e4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h
@@ -24,17 +24,6 @@
#define DECLARE_ALIGNED(n,typ,val) typ val
#endif
-
-/* Declare an aligned array on the stack, for situations where the stack
- * pointer may not have the alignment we expect. Creates an array with a
- * modified name, then defines val to be a pointer, and aligns that pointer
- * within the array.
- */
-#define DECLARE_ALIGNED_ARRAY(a,typ,val,n)\
- typ val##_[(n)+(a)/sizeof(typ)+1];\
- typ *val = (typ*)((((intptr_t)val##_)+(a)-1)&((intptr_t)-(a)))
-
-
/* Indicates that the usage of the specified variable has been audited to assure
* that it's safe to use uninitialized. Silences 'may be used uninitialized'
* warnings on gcc.
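
The removed DECLARE_ALIGNED_ARRAY macro solved stack alignment portably:
declare an over-sized local array, then round a pointer up to the requested
alignment inside it. Expanded by hand below for a 16-byte-aligned array of 64
int16_t (demo is an illustrative function name):

    #include <stdint.h>

    void demo(void) {
      /* Extra elements cover the worst-case shift plus the rounding. */
      int16_t buf_[64 + 16 / sizeof(int16_t) + 1];
      /* Round the array's address up to the next multiple of 16. */
      int16_t *buf = (int16_t *)(((intptr_t)buf_ + 15) & (intptr_t)-16);
      buf[0] = 0; /* always use buf, never buf_ */
    }
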
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h
index bd9eebd643b..f1df3943457 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h
@@ -110,7 +110,7 @@ static void once(void (*func)(void))
#else
-/* No-op version that performs no synchronization. vp8_rtcd() is idempotent,
+/* No-op version that performs no synchronization. *_rtcd() is idempotent,
* so as long as your platform provides atomic loads/stores of pointers
* no synchronization is strictly necessary.
*/
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk
index 869a204fbc7..a7275431fe9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk
@@ -11,7 +11,6 @@
PORTS_SRCS-yes += vpx_ports.mk
-PORTS_SRCS-$(BUILD_LIBVPX) += asm_offsets.h
PORTS_SRCS-$(BUILD_LIBVPX) += mem.h
PORTS_SRCS-$(BUILD_LIBVPX) += vpx_timer.h
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h
index 81c2b8b873f..7d93710c4b0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h
@@ -13,6 +13,7 @@
#define VPX_PORTS_X86_H_
#include <stdlib.h>
#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
#ifdef __cplusplus
extern "C" {
@@ -104,6 +105,37 @@ void __cpuid(int CPUInfo[4], int info_type);
#endif
#endif /* end others */
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+ const uint32_t ecx = 0;
+ uint32_t eax, edx;
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm__ volatile (
+ ".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(eax), "=d"(edx) : "c" (ecx));
+ return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+ defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+ uint32_t eax_, edx_;
+ __asm {
+ xor ecx, ecx // ecx = 0
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+ mov eax_, eax
+ mov edx_, edx
+ }
+ return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
+#endif
+
#define HAS_MMX 0x01
#define HAS_SSE 0x02
#define HAS_SSE2 0x04
@@ -120,7 +152,7 @@ static INLINE int
x86_simd_caps(void) {
unsigned int flags = 0;
unsigned int mask = ~0;
- unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+ unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
char *env;
(void)reg_ebx;
@@ -136,9 +168,9 @@ x86_simd_caps(void) {
mask = strtol(env, NULL, 0);
/* Ensure that the CPUID instruction supports extended features */
- cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
- if (reg_eax < 1)
+ if (max_cpuid_val < 1)
return 0;
/* Get the standard feature flags */
@@ -156,14 +188,19 @@ x86_simd_caps(void) {
if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
- if (reg_ecx & BIT(28)) flags |= HAS_AVX;
+ // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+ if ((xgetbv() & 0x6) == 0x6) {
+ flags |= HAS_AVX;
- /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
- reg_eax = 7;
- reg_ecx = 0;
- cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ if (max_cpuid_val >= 7) {
+ /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+ cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
- if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+ if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+ }
+ }
+ }
return flags & mask;
}
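
The new AVX gate checks three things in order: the CPU advertises AVX
(CPUID.1:ECX bit 28), the OS has enabled XSAVE (bit 27, OSXSAVE), and XCR0
confirms the OS actually saves both SSE and AVX register state (bits 1 and 2).
A condensed, self-contained sketch of the same logic for GCC/Clang on x86
(my_xgetbv and avx_usable are illustrative names):

    #include <stdint.h>

    #define BIT(n) (1u << (n))

    /* xgetbv via the raw opcode, as in the patch above. */
    static uint64_t my_xgetbv(void) {
      uint32_t eax, edx;
      __asm__ volatile(".byte 0x0f, 0x01, 0xd0"
                       : "=a"(eax), "=d"(edx) : "c"(0));
      return ((uint64_t)edx << 32) | eax;
    }

    static int avx_usable(uint32_t cpuid1_ecx) {
      /* Check OSXSAVE first: executing xgetbv without it faults. */
      if ((cpuid1_ecx & (BIT(27) | BIT(28))) != (BIT(27) | BIT(28)))
        return 0;
      /* XCR0 bits 1 (SSE state) and 2 (AVX state) must both be set. */
      return (my_xgetbv() & 0x6) == 0x6;
    }
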
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm
index 3814ef44380..c94b76a0606 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm
@@ -395,7 +395,7 @@ section .text
 ; On Android platforms use lrand48 when building postproc routines. Prior to
 ; Android L, rand() was not available.
-%if CONFIG_POSTPROC=1
+%if CONFIG_POSTPROC=1 || CONFIG_VP9_POSTPROC=1
%ifdef __ANDROID__
extern sym(lrand48)
%define LIBVPX_RAND lrand48
@@ -403,4 +403,4 @@ extern sym(lrand48)
extern sym(rand)
%define LIBVPX_RAND rand
%endif
-%endif ; CONFIG_POSTPROC
+%endif ; CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c
index 5f355c5a6bb..995c45b6ab6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c
@@ -215,7 +215,7 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source,
unsigned int dest_width) {
(void) dest_pitch;
(void) src_pitch;
- vpx_memcpy(dest, source, dest_width);
+ memcpy(dest, source, dest_width);
}
void vp8_vertical_band_2_1_scale_i_c(unsigned char *source,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c
index 8044d2ad776..089e673757c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c
@@ -379,7 +379,7 @@ void Scale2D
vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, dest_width);
if (interpolation)
- vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
+ memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
/* Next band... */
source += (unsigned long) source_band_height * source_pitch;
@@ -432,7 +432,7 @@ void Scale2D
temp_area + i * dest_pitch, 1, hratio, dest_width);
} else { /* Duplicate the last row */
/* copy temp_area row 0 over from last row in the past */
- vpx_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
+ memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
}
}
@@ -443,7 +443,7 @@ void Scale2D
}
/* copy temp_area row 0 over from last row in the past */
- vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+ memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
/* move to the next band */
source += source_band_height * source_pitch;
@@ -498,11 +498,11 @@ void vpx_scale_frame
if (dw < (int)dst->y_width)
for (i = 0; i < dh; i++)
- vpx_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
+ memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
if (dh < (int)dst->y_height)
for (i = dh - 1; i < (int)dst->y_height; i++)
- vpx_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
+ memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
(unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
@@ -510,11 +510,11 @@ void vpx_scale_frame
if (dw / 2 < (int)dst->uv_width)
for (i = 0; i < dst->uv_height; i++)
- vpx_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
+ memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
if (dh / 2 < (int)dst->uv_height)
for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
- vpx_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+ memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
(unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
@@ -522,9 +522,9 @@ void vpx_scale_frame
if (dw / 2 < (int)dst->uv_width)
for (i = 0; i < dst->uv_height; i++)
- vpx_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
+ memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
if (dh / 2 < (int) dst->uv_height)
for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
- vpx_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+ memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c
index 00a8c163a6a..169c2ab2d73 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c
@@ -38,7 +38,7 @@ vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
/* buffer_alloc isn't accessed by most functions. Rather y_buffer,
u_buffer and v_buffer point to buffer_alloc and are used. Clear out
all of this so that a freed pointer isn't inadvertently used */
- vpx_memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+ memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
} else {
return -1;
}
@@ -128,7 +128,7 @@ int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
/* buffer_alloc isn't accessed by most functions. Rather y_buffer,
u_buffer and v_buffer point to buffer_alloc and are used. Clear out
all of this so that a freed pointer isn't inadvertently used */
- vpx_memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+ memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
} else {
return -1;
}
@@ -143,22 +143,25 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int use_highbitdepth,
#endif
int border,
+ int byte_alignment,
vpx_codec_frame_buffer_t *fb,
vpx_get_frame_buffer_cb_fn_t cb,
void *cb_priv) {
if (ybf) {
+ const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
const int aligned_width = (width + 7) & ~7;
const int aligned_height = (height + 7) & ~7;
const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
const uint64_t yplane_size = (aligned_height + 2 * border) *
- (uint64_t)y_stride;
+ (uint64_t)y_stride + byte_alignment;
const int uv_width = aligned_width >> ss_x;
const int uv_height = aligned_height >> ss_y;
const int uv_stride = y_stride >> ss_x;
const int uv_border_w = border >> ss_x;
const int uv_border_h = border >> ss_y;
const uint64_t uvplane_size = (uv_height + 2 * uv_border_h) *
- (uint64_t)uv_stride;
+ (uint64_t)uv_stride + byte_alignment;
+
#if CONFIG_ALPHA
const int alpha_width = aligned_width;
const int alpha_height = aligned_height;
@@ -166,7 +169,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
const int alpha_border_w = border;
const int alpha_border_h = border;
const uint64_t alpha_plane_size = (alpha_height + 2 * alpha_border_h) *
- (uint64_t)alpha_stride;
+ (uint64_t)alpha_stride + byte_alignment;
#if CONFIG_VP9_HIGHBITDEPTH
const uint64_t frame_size = (1 + use_highbitdepth) *
(yplane_size + 2 * uvplane_size + alpha_plane_size);
@@ -182,6 +185,9 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
const uint64_t frame_size = yplane_size + 2 * uvplane_size;
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_ALPHA
+
+ uint8_t *buf = NULL;
+
if (cb != NULL) {
const int align_addr_extra_size = 31;
const uint64_t external_frame_size = frame_size + align_addr_extra_size;
@@ -216,7 +222,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
// This memset is needed for fixing valgrind error from C loop filter
// due to access uninitialized memory in frame border. It could be
// removed if border is totally removed.
- vpx_memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
+ memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
}
/* Only support allocating buffers that have a border that's a multiple
@@ -244,38 +250,33 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
ybf->subsampling_x = ss_x;
ybf->subsampling_y = ss_y;
+ buf = ybf->buffer_alloc;
#if CONFIG_VP9_HIGHBITDEPTH
if (use_highbitdepth) {
// Store uint16 addresses when using 16bit framebuffers
- uint8_t *p = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
- ybf->y_buffer = p + (border * y_stride) + border;
- ybf->u_buffer = p + yplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
- ybf->v_buffer = p + yplane_size + uvplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
+ buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
ybf->flags = YV12_FLAG_HIGHBITDEPTH;
} else {
- ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
- ybf->u_buffer = ybf->buffer_alloc + yplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
- ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
ybf->flags = 0;
}
-#else
- ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
- ybf->u_buffer = ybf->buffer_alloc + yplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
- ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
#endif // CONFIG_VP9_HIGHBITDEPTH
+ ybf->y_buffer = (uint8_t *)yv12_align_addr(
+ buf + (border * y_stride) + border, vp9_byte_align);
+ ybf->u_buffer = (uint8_t *)yv12_align_addr(
+ buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,
+ vp9_byte_align);
+ ybf->v_buffer = (uint8_t *)yv12_align_addr(
+ buf + yplane_size + uvplane_size + (uv_border_h * uv_stride) +
+ uv_border_w, vp9_byte_align);
+
#if CONFIG_ALPHA
ybf->alpha_width = alpha_width;
ybf->alpha_height = alpha_height;
ybf->alpha_stride = alpha_stride;
- ybf->alpha_buffer = ybf->buffer_alloc + yplane_size + 2 * uvplane_size +
- (alpha_border_h * alpha_stride) + alpha_border_w;
+ ybf->alpha_buffer = (uint8_t *)yv12_align_addr(
+ buf + yplane_size + 2 * uvplane_size +
+ (alpha_border_h * alpha_stride) + alpha_border_w, vp9_byte_align);
#endif
ybf->corrupted = 0; /* assume not corrupted by errors */
return 0;
@@ -289,14 +290,15 @@ int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
#if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth,
#endif
- int border) {
+ int border,
+ int byte_alignment) {
if (ybf) {
vp9_free_frame_buffer(ybf);
return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- border, NULL, NULL, NULL);
+ border, byte_alignment, NULL, NULL, NULL);
}
return -2;
}
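
The byte_alignment change above has two halves: each plane's size is padded by
the alignment so there is slack to round into, and each plane pointer is then
rounded up to the alignment. A sketch of the rounding step, assuming the
alignment is a power of two (align_up stands in for the patch's
yv12_align_addr):

    #include <stdint.h>

    /* Round p up to the next multiple of align (align a power of two). */
    static uint8_t *align_up(uint8_t *p, int align) {
      return (uint8_t *)(((uintptr_t)p + align - 1) &
                         ~(uintptr_t)(align - 1));
    }

    /* e.g. the y plane: skip the border first, then align. The
     * byte_alignment slack added to yplane_size guarantees the rounded-up
     * pointer still fits inside the plane's allocation. */
    static uint8_t *y_plane_start(uint8_t *buf, int border, int y_stride,
                                  int align) {
      return align_up(buf + border * y_stride + border, align);
    }
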
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c
index 0485452aec3..6214a12189c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c
@@ -31,8 +31,8 @@ static void extend_plane(uint8_t *const src, int src_stride,
uint8_t *dst_ptr2 = src + width;
for (i = 0; i < height; ++i) {
- vpx_memset(dst_ptr1, src_ptr1[0], extend_left);
- vpx_memset(dst_ptr2, src_ptr2[0], extend_right);
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ memset(dst_ptr2, src_ptr2[0], extend_right);
src_ptr1 += src_stride;
src_ptr2 += src_stride;
dst_ptr1 += src_stride;
@@ -48,12 +48,12 @@ static void extend_plane(uint8_t *const src, int src_stride,
dst_ptr2 = src + src_stride * height - extend_left;
for (i = 0; i < extend_top; ++i) {
- vpx_memcpy(dst_ptr1, src_ptr1, linesize);
+ memcpy(dst_ptr1, src_ptr1, linesize);
dst_ptr1 += src_stride;
}
for (i = 0; i < extend_bottom; ++i) {
- vpx_memcpy(dst_ptr2, src_ptr2, linesize);
+ memcpy(dst_ptr2, src_ptr2, linesize);
dst_ptr2 += src_stride;
}
}
@@ -91,12 +91,12 @@ static void extend_plane_high(uint8_t *const src8, int src_stride,
dst_ptr2 = src + src_stride * height - extend_left;
for (i = 0; i < extend_top; ++i) {
- vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
dst_ptr1 += src_stride;
}
for (i = 0; i < extend_bottom; ++i) {
- vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
dst_ptr2 += src_stride;
}
}
@@ -122,17 +122,17 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
extend_plane_high(
ybf->u_buffer, ybf->uv_stride,
- (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
- ybf->border / 2, ybf->border / 2,
- (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
- (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+ ybf->uv_crop_width, ybf->uv_crop_height,
+ uv_border, uv_border,
+ uv_border + ybf->uv_height - ybf->uv_crop_height,
+ uv_border + ybf->uv_width - ybf->uv_crop_width);
extend_plane_high(
ybf->v_buffer, ybf->uv_stride,
- (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
- ybf->border / 2, ybf->border / 2,
- (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
- (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+ ybf->uv_crop_width, ybf->uv_crop_height,
+ uv_border, uv_border,
+ uv_border + ybf->uv_height - ybf->uv_crop_height,
+ uv_border + ybf->uv_width - ybf->uv_crop_width);
return;
}
#endif
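
The rewritten calls above read the uv_crop_* fields cached in YV12_BUFFER_CONFIG instead of re-deriving them from the Y plane; for 4:2:0 content the two forms agree. A sketch of the assumed derivation, using the same rounding as the old inline expressions:

    /* 4:2:0 chroma dimensions: the Y dimensions halved, rounded up. */
    static void derive_uv_dims(int y_crop_width, int y_crop_height,
                               int *uv_crop_width, int *uv_crop_height) {
      *uv_crop_width = (y_crop_width + 1) / 2;
      *uv_crop_height = (y_crop_height + 1) / 2;
    }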
@@ -212,7 +212,7 @@ void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- vpx_memcpy(dst, src, num * sizeof(uint16_t));
+ memcpy(dst, src, num * sizeof(uint16_t));
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP9
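
For context: in CONFIG_VP9_HIGHBITDEPTH builds an 8-bit plane pointer doubles as a tagged alias for a 16-bit sample pointer, which is why memcpy_short_addr copies num elements of sizeof(uint16_t). A sketch of the assumed conversion, mirroring libvpx's CONVERT_TO_SHORTPTR (treat the exact tagging as an assumption):

    #include <stdint.h>

    /* Assumed tagging scheme: the uint8_t * stores half the real address,
     * so shifting left by one bit recovers the uint16_t * sample pointer. */
    #define SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))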
@@ -269,7 +269,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
#endif
for (row = 0; row < src_ybc->y_height; ++row) {
- vpx_memcpy(dst, src, src_ybc->y_width);
+ memcpy(dst, src, src_ybc->y_width);
src += src_ybc->y_stride;
dst += dst_ybc->y_stride;
}
@@ -278,7 +278,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
dst = dst_ybc->u_buffer;
for (row = 0; row < src_ybc->uv_height; ++row) {
- vpx_memcpy(dst, src, src_ybc->uv_width);
+ memcpy(dst, src, src_ybc->uv_width);
src += src_ybc->uv_stride;
dst += dst_ybc->uv_stride;
}
@@ -287,7 +287,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
dst = dst_ybc->v_buffer;
for (row = 0; row < src_ybc->uv_height; ++row) {
- vpx_memcpy(dst, src, src_ybc->uv_width);
+ memcpy(dst, src, src_ybc->uv_width);
src += src_ybc->uv_stride;
dst += dst_ybc->uv_stride;
}
@@ -306,7 +306,7 @@ void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
for (row = 0; row < src_ybc->y_height; ++row) {
- vpx_memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
+ memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
src16 += src_ybc->y_stride;
dst16 += dst_ybc->y_stride;
}
@@ -315,7 +315,7 @@ void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
#endif
for (row = 0; row < src_ybc->y_height; ++row) {
- vpx_memcpy(dst, src, src_ybc->y_width);
+ memcpy(dst, src, src_ybc->y_width);
src += src_ybc->y_stride;
dst += dst_ybc->y_stride;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
index 0dfc47cc874..19f84cb1960 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
@@ -94,12 +94,12 @@ static void extend_plane(uint8_t *const src, int src_stride,
linesize = extend_left + extend_right + width;
for (i = 0; i < extend_top; i++) {
- vpx_memcpy(top_dst, top_src, linesize);
+ memcpy(top_dst, top_src, linesize);
top_dst += src_stride;
}
for (i = 0; i < extend_bottom; i++) {
- vpx_memcpy(bot_dst, bot_src, linesize);
+ memcpy(bot_dst, bot_src, linesize);
bot_dst += src_stride;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale.mk b/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale.mk
index 0a1594bd8f5..a49abf3b4b7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale.mk
@@ -1,11 +1,10 @@
SCALE_SRCS-yes += vpx_scale.mk
SCALE_SRCS-yes += yv12config.h
-SCALE_SRCS-yes += vpx_scale.h
-SCALE_SRCS-yes += generic/vpx_scale.c
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += vpx_scale.h
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/vpx_scale.c
SCALE_SRCS-yes += generic/yv12config.c
SCALE_SRCS-yes += generic/yv12extend.c
SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
-SCALE_SRCS-yes += vpx_scale_asm_offsets.c
SCALE_SRCS-yes += vpx_scale_rtcd.c
SCALE_SRCS-yes += vpx_scale_rtcd.pl
@@ -14,7 +13,4 @@ SCALE_SRCS-$(HAVE_DSPR2) += mips/dspr2/yv12extend_dspr2.c
SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes)
-$(eval $(call asm_offsets_template,\
- vpx_scale_asm_offsets.asm, vpx_scale/vpx_scale_asm_offsets.c))
-
$(eval $(call rtcd_h_template,vpx_scale_rtcd,vpx_scale/vpx_scale_rtcd.pl))
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_asm_offsets.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_asm_offsets.c
deleted file mode 100644
index caa9e80ffcb..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_asm_offsets.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_scale/yv12config.h"
-
-BEGIN
-
-/* vpx_scale */
-DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
-DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS);
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
-
-#if HAVE_NEON
-/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
-ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
-#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_rtcd.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_rtcd.c
index 656a22f5240..bea603fd104 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_rtcd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_rtcd.c
@@ -7,9 +7,9 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#define RTCD_C
-#include "vpx_scale_rtcd.h"
+#include "./vpx_scale_rtcd.h"
#include "vpx_ports/vpx_once.h"
void vpx_scale_rtcd()
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/win32/scaleopt.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/win32/scaleopt.c
deleted file mode 100644
index 4336ecea35d..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/win32/scaleopt.c
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-* Module Title : scaleopt.c
-*
-* Description : Optimized scaling functions
-*
-****************************************************************************/
-#include "pragmas.h"
-
-/****************************************************************************
-* Module Statics
-****************************************************************************/
-__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
-
-#include "vpx_scale/vpx_scale.h"
-#include "vpx_mem/vpx_mem.h"
-
-__declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };
-__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };
-
-
-/****************************************************************************
- *
- * ROUTINE : horizontal_line_5_4_scale_mmx
- *
- * INPUTS : const unsigned char *source : Pointer to source data.
- * unsigned int source_width : Width of the source line.
- * unsigned char *dest : Pointer to destination data.
- * unsigned int dest_width : Stride of destination (NOT USED).
- *
- * OUTPUTS : None.
- *
- * RETURNS : void
- *
- * FUNCTION : Copies horizontal line of pixels from source to
- * destination scaling down from 5 to 4.
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_5_4_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-) {
- /*
- unsigned i;
- unsigned int a, b, c, d, e;
- unsigned char *des = dest;
- const unsigned char *src = source;
-
- (void) dest_width;
-
- for ( i=0; i<source_width; i+=5 )
- {
- a = src[0];
- b = src[1];
- c = src[2];
- d = src[3];
- e = src[4];
-
- des[0] = a;
- des[1] = ((b*192 + c* 64 + 128)>>8);
- des[2] = ((c*128 + d*128 + 128)>>8);
- des[3] = ((d* 64 + e*192 + 128)>>8);
-
- src += 5;
- des += 4;
- }
- */
- (void) dest_width;
-
- __asm {
-
- mov esi, source;
- mov edi, dest;
-
- mov ecx, source_width;
- movq mm5, const54_1;
-
- pxor mm7, mm7;
- movq mm6, const54_2;
-
- movq mm4, round_values;
- lea edx, [esi+ecx];
- horizontal_line_5_4_loop:
-
- movq mm0, QWORD PTR [esi]; // 00 01 02 03 04 05 06 07
- movq mm1, mm0; // 00 01 02 03 04 05 06 07
-
- psrlq mm0, 8; // 01 02 03 04 05 06 07 xx
- punpcklbw mm1, mm7; // xx 00 xx 01 xx 02 xx 03
-
- punpcklbw mm0, mm7; // xx 01 xx 02 xx 03 xx 04
- pmullw mm1, mm5
-
- pmullw mm0, mm6
- add esi, 5
-
- add edi, 4
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- cmp esi, edx
- packuswb mm1, mm7
-
- movd DWORD PTR [edi-4], mm1
-
- jl horizontal_line_5_4_loop
-
- }
-
-}
-__declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 };
-__declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 };
-__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
-
-static
-void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
- __asm {
- push ebx
-
- mov esi, source // Get the source pointer
- mov ecx, src_pitch // Get the source pitch
-
- mov edi, dest // Get the destination pointer
- pxor mm7, mm7 // clear out mm7
-
- mov edx, dest_pitch // Get the destination pitch
- mov ebx, dest_width
-
- vs_5_4_loop:
-
- movd mm0, DWORD ptr [esi] // src[0];
- movd mm1, DWORD ptr [esi+ecx] // src[1];
-
- movd mm2, DWORD ptr [esi+ecx*2]
- lea eax, [esi+ecx*2] //
-
- punpcklbw mm1, mm7
- punpcklbw mm2, mm7
-
- movq mm3, mm2
- pmullw mm1, three_fourths
-
- pmullw mm2, one_fourths
- movd mm4, [eax+ecx]
-
- pmullw mm3, two_fourths
- punpcklbw mm4, mm7
-
- movq mm5, mm4
- pmullw mm4, two_fourths
-
- paddw mm1, mm2
- movd mm6, [eax+ecx*2]
-
- pmullw mm5, one_fourths
- paddw mm1, round_values;
-
- paddw mm3, mm4
- psrlw mm1, 8
-
- punpcklbw mm6, mm7
- paddw mm3, round_values
-
- pmullw mm6, three_fourths
- psrlw mm3, 8
-
- packuswb mm1, mm7
- packuswb mm3, mm7
-
- movd DWORD PTR [edi], mm0
- movd DWORD PTR [edi+edx], mm1
-
-
- paddw mm5, mm6
- movd DWORD PTR [edi+edx*2], mm3
-
- lea eax, [edi+edx*2]
- paddw mm5, round_values
-
- psrlw mm5, 8
- add edi, 4
-
- packuswb mm5, mm7
- movd DWORD PTR [eax+edx], mm5
-
- add esi, 4
- sub ebx, 4
-
- jg vs_5_4_loop
-
- pop ebx
- }
-}
-
-
-__declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 };
-__declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 };
-
-
-static
-void horizontal_line_5_3_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-) {
-
- (void) dest_width;
- __asm {
-
- mov esi, source;
- mov edi, dest;
-
- mov ecx, source_width;
- movq mm5, const53_1;
-
- pxor mm7, mm7;
- movq mm6, const53_2;
-
- movq mm4, round_values;
- lea edx, [esi+ecx-5];
- horizontal_line_5_3_loop:
-
- movq mm0, QWORD PTR [esi]; // 00 01 02 03 04 05 06 07
- movq mm1, mm0; // 00 01 02 03 04 05 06 07
-
- psllw mm0, 8; // xx 00 xx 02 xx 04 xx 06
- psrlw mm1, 8; // 01 xx 03 xx 05 xx 07 xx
-
- psrlw mm0, 8; // 00 xx 02 xx 04 xx 06 xx
- psllq mm1, 16; // xx xx 01 xx 03 xx 05 xx
-
- pmullw mm0, mm6
-
- pmullw mm1, mm5
- add esi, 5
-
- add edi, 3
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- cmp esi, edx
- packuswb mm1, mm7
-
- movd DWORD PTR [edi-3], mm1
- jl horizontal_line_5_3_loop
-
-// exit condition
- movq mm0, QWORD PTR [esi]; // 00 01 02 03 04 05 06 07
- movq mm1, mm0; // 00 01 02 03 04 05 06 07
-
- psllw mm0, 8; // xx 00 xx 02 xx 04 xx 06
- psrlw mm1, 8; // 01 xx 03 xx 05 xx 07 xx
-
- psrlw mm0, 8; // 00 xx 02 xx 04 xx 06 xx
- psllq mm1, 16; // xx xx 01 xx 03 xx 05 xx
-
- pmullw mm0, mm6
-
- pmullw mm1, mm5
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- packuswb mm1, mm7
- movd eax, mm1
-
- mov edx, eax
- shr edx, 16
-
- mov WORD PTR[edi], ax
- mov BYTE PTR[edi+2], dl
-
- }
-
-}
-
-__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 };
-__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
-
-static
-void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
- __asm {
- push ebx
-
- mov esi, source // Get the source pointer
- mov ecx, src_pitch // Get the source pitch
-
- mov edi, dest // Get the destination pointer
- pxor mm7, mm7 // clear out mm7
-
- mov edx, dest_pitch // Get the destination pitch
- movq mm5, one_thirds
-
- movq mm6, two_thirds
- mov ebx, dest_width;
-
- vs_5_3_loop:
-
- movd mm0, DWORD ptr [esi] // src[0];
- movd mm1, DWORD ptr [esi+ecx] // src[1];
-
- movd mm2, DWORD ptr [esi+ecx*2]
- lea eax, [esi+ecx*2] //
-
- punpcklbw mm1, mm7
- punpcklbw mm2, mm7
-
- pmullw mm1, mm5
- pmullw mm2, mm6
-
- movd mm3, DWORD ptr [eax+ecx]
- movd mm4, DWORD ptr [eax+ecx*2]
-
- punpcklbw mm3, mm7
- punpcklbw mm4, mm7
-
- pmullw mm3, mm6
- pmullw mm4, mm5
-
-
- movd DWORD PTR [edi], mm0
- paddw mm1, mm2
-
- paddw mm1, round_values
- psrlw mm1, 8
-
- packuswb mm1, mm7
- paddw mm3, mm4
-
- paddw mm3, round_values
- movd DWORD PTR [edi+edx], mm1
-
- psrlw mm3, 8
- packuswb mm3, mm7
-
- movd DWORD PTR [edi+edx*2], mm3
-
-
- add edi, 4
- add esi, 4
-
- sub ebx, 4
- jg vs_5_3_loop
-
- pop ebx
- }
-}
-
-
-
-
-/****************************************************************************
- *
- * ROUTINE : horizontal_line_2_1_scale_mmx
- *
- * INPUTS : const unsigned char *source : Pointer to source data.
- * unsigned int source_width : Width of source line (NOT USED).
- * unsigned char *dest : Pointer to destination data.
- * unsigned int dest_width : Width of destination line.
- *
- * OUTPUTS : None.
- *
- * RETURNS : void
- *
- * FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels.
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_2_1_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-) {
- (void) dest_width;
- (void) source_width;
- __asm {
- mov esi, source
- mov edi, dest
-
- pxor mm7, mm7
- mov ecx, dest_width
-
- xor edx, edx
- hs_2_1_loop:
-
- movq mm0, [esi+edx*2]
- psllw mm0, 8
-
- psrlw mm0, 8
- packuswb mm0, mm7
-
- movd DWORD Ptr [edi+edx], mm0;
- add edx, 4
-
- cmp edx, ecx
- jl hs_2_1_loop
-
- }
-}
-
-
-
-static
-void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
- (void) dest_pitch;
- (void) src_pitch;
- vpx_memcpy(dest, source, dest_width);
-}
-
-
-__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
-__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
-
-static
-void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
- (void) dest_pitch;
- __asm {
- mov esi, source
- mov edi, dest
-
- mov eax, src_pitch
- mov edx, dest_width
-
- pxor mm7, mm7
- sub esi, eax // back one line
-
-
- lea ecx, [esi+edx];
- movq mm6, round_values;
-
- movq mm5, three_sixteenths;
- movq mm4, ten_sixteenths;
-
- vs_2_1_i_loop:
- movd mm0, [esi] //
- movd mm1, [esi+eax] //
-
- movd mm2, [esi+eax*2] //
- punpcklbw mm0, mm7
-
- pmullw mm0, mm5
- punpcklbw mm1, mm7
-
- pmullw mm1, mm4
- punpcklbw mm2, mm7
-
- pmullw mm2, mm5
- paddw mm0, round_values
-
- paddw mm1, mm2
- paddw mm0, mm1
-
- psrlw mm0, 8
- packuswb mm0, mm7
-
- movd DWORD PTR [edi], mm0
- add esi, 4
-
- add edi, 4;
- cmp esi, ecx
- jl vs_2_1_i_loop
-
- }
-}
-
-
-
-void
-register_mmxscalers(void) {
- vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
- vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
- vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
- vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
- vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
- vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
- vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/yv12config.h b/chromium/third_party/libvpx/source/libvpx/vpx_scale/yv12config.h
index b9f13fd8918..76cf771c74a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/yv12config.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/yv12config.h
@@ -55,12 +55,13 @@ typedef struct yv12_buffer_config {
int subsampling_x;
int subsampling_y;
unsigned int bit_depth;
+ vpx_color_space_t color_space;
int corrupted;
int flags;
} YV12_BUFFER_CONFIG;
-#define YV12_FLAG_HIGHBITDEPTH 1
+#define YV12_FLAG_HIGHBITDEPTH 8
int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height, int border);
@@ -73,9 +74,10 @@ int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
#if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth,
#endif
- int border);
+ int border, int byte_alignment);
-// Updates the yv12 buffer config with the frame buffer. If cb is not
+// Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
+// be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not
// NULL, then libvpx is using the frame buffer callbacks to handle memory.
// If cb is not NULL, libvpx will call cb with minimum size in bytes needed
// to decode the current frame. If cb is NULL, libvpx will allocate memory
@@ -87,6 +89,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int use_highbitdepth,
#endif
int border,
+ int byte_alignment,
vpx_codec_frame_buffer_t *fb,
vpx_get_frame_buffer_cb_fn_t cb,
void *cb_priv);
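
An illustrative call with the new parameter (hedged: the dimensions, border, and helper name below are arbitrary; per the comment above, byte_alignment must be a power of 2 in [32, 1024], and 0 keeps the legacy alignment):

    #include <string.h>
    #include "vpx_scale/yv12config.h"

    /* Allocate a 1080p 4:2:0 frame whose plane addresses are 32-byte
     * aligned. Returns 0 on success, negative on failure. */
    static int alloc_aligned_frame(YV12_BUFFER_CONFIG *buf) {
      memset(buf, 0, sizeof(*buf));
      return vp9_realloc_frame_buffer(buf, 1920, 1080, /*ss_x=*/1, /*ss_y=*/1,
    #if CONFIG_VP9_HIGHBITDEPTH
                                      /*use_highbitdepth=*/0,
    #endif
                                      /*border=*/32, /*byte_alignment=*/32,
                                      NULL, NULL, NULL);
    }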
diff --git a/chromium/third_party/libvpx/source/libvpx/vpxdec.c b/chromium/third_party/libvpx/source/libvpx/vpxdec.c
index 2afdb715b82..8c938df8de6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpxdec.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpxdec.c
@@ -75,6 +75,8 @@ static const arg_def_t outputfile = ARG_DEF(
"o", "output", 1, "Output file name pattern (see below)");
static const arg_def_t threadsarg = ARG_DEF(
"t", "threads", 1, "Max threads to use");
+static const arg_def_t frameparallelarg = ARG_DEF(
+ NULL, "frame-parallel", 0, "Frame parallel decode");
static const arg_def_t verbosearg = ARG_DEF(
"v", "verbose", 0, "Show version string");
static const arg_def_t error_concealment = ARG_DEF(
@@ -95,7 +97,7 @@ static const arg_def_t outbitdeptharg = ARG_DEF(
static const arg_def_t *all_args[] = {
&codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg,
&progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
- &threadsarg, &verbosearg, &scalearg, &fb_arg,
+ &threadsarg, &frameparallelarg, &verbosearg, &scalearg, &fb_arg,
&md5arg, &error_concealment, &continuearg,
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
&outbitdeptharg,
@@ -131,7 +133,7 @@ static const arg_def_t *vp8_pp_args[] = {
#endif
#if CONFIG_LIBYUV
-static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
+static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst,
FilterModeEnum mode) {
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
if (src->fmt == VPX_IMG_FMT_I42016) {
@@ -276,7 +278,8 @@ static void update_image_md5(const vpx_image_t *img, const int planes[3],
const int plane = planes[i];
const unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane);
+ const int w = vpx_img_plane_width(img, plane) *
+ ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
const int h = vpx_img_plane_height(img, plane);
for (y = 0; y < h; ++y) {
@@ -518,7 +521,7 @@ static FILE *open_outfile(const char *name) {
} else {
FILE *file = fopen(name, "wb");
if (!file)
- fatal("Failed to output file %s", name);
+ fatal("Failed to open output file '%s'", name);
return file;
}
}
@@ -541,7 +544,7 @@ int main_loop(int argc, const char **argv_) {
size_t bytes_in_buffer = 0, buffer_size = 0;
FILE *infile;
int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
- int do_md5 = 0, progress = 0;
+ int do_md5 = 0, progress = 0, frame_parallel = 0;
int stop_after = 0, postproc = 0, summary = 0, quiet = 1;
int arg_skip = 0;
int ec_enabled = 0;
@@ -574,7 +577,7 @@ int main_loop(int argc, const char **argv_) {
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
vpx_image_t *img_shifted = NULL;
#endif
- int frame_avail, got_data;
+ int frame_avail, got_data, flush_decoder = 0;
int num_external_frame_buffers = 0;
struct ExternalFrameBufferList ext_fb_list = {0, NULL};
@@ -641,6 +644,10 @@ int main_loop(int argc, const char **argv_) {
summary = 1;
else if (arg_match(&arg, &threadsarg, argi))
cfg.threads = arg_parse_uint(&arg);
+#if CONFIG_VP9_DECODER
+ else if (arg_match(&arg, &frameparallelarg, argi))
+ frame_parallel = 1;
+#endif
else if (arg_match(&arg, &verbosearg, argi))
quiet = 0;
else if (arg_match(&arg, &scalearg, argi))
@@ -717,15 +724,15 @@ int main_loop(int argc, const char **argv_) {
/* Handle non-option arguments */
fn = argv[0];
- if (!fn)
+ if (!fn) {
+ free(argv);
usage_exit();
-
+ }
/* Open file */
infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
if (!infile) {
- fprintf(stderr, "Failed to open file '%s'", strcmp(fn, "-") ? fn : "stdin");
- return EXIT_FAILURE;
+ fatal("Failed to open input file '%s'", strcmp(fn, "-") ? fn : "stdin");
}
#if CONFIG_OS_SUPPORT
/* Make sure we don't dump to the terminal, unless forced to with -o - */
@@ -793,7 +800,8 @@ int main_loop(int argc, const char **argv_) {
interface = get_vpx_decoder_by_index(0);
dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) |
- (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0);
+ (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) |
+ (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0);
if (vpx_codec_dec_init(&decoder, interface->codec_interface(),
&cfg, dec_flags)) {
fprintf(stderr, "Failed to initialize decoder: %s\n",
@@ -867,7 +875,7 @@ int main_loop(int argc, const char **argv_) {
vpx_codec_iter_t iter = NULL;
vpx_image_t *img;
struct vpx_usec_timer timer;
- int corrupted;
+ int corrupted = 0;
frame_avail = 0;
if (!stop_after || frame_in < stop_after) {
@@ -891,11 +899,22 @@ int main_loop(int argc, const char **argv_) {
vpx_usec_timer_mark(&timer);
dx_time += vpx_usec_timer_elapsed(&timer);
+ } else {
+ flush_decoder = 1;
}
+ } else {
+ flush_decoder = 1;
}
vpx_usec_timer_start(&timer);
+ if (flush_decoder) {
+ // Flush the decoder in frame parallel decode.
+ if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) {
+ warn("Failed to flush decoder: %s", vpx_codec_error(&decoder));
+ }
+ }
+
got_data = 0;
if ((img = vpx_codec_get_frame(&decoder, &iter))) {
++frame_out;
@@ -905,9 +924,11 @@ int main_loop(int argc, const char **argv_) {
vpx_usec_timer_mark(&timer);
dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
- if (vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
+ if (!frame_parallel &&
+ vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
- goto fail;
+ if (!keep_going)
+ goto fail;
}
frames_corrupted += corrupted;
@@ -947,7 +968,7 @@ int main_loop(int argc, const char **argv_) {
if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
#if CONFIG_LIBYUV
- vpx_image_scale(img, scaled_img, kFilterBox);
+ libyuv_scale(img, scaled_img, kFilterBox);
img = scaled_img;
#else
fprintf(stderr, "Failed to scale output frame: %s.\n"
@@ -1059,9 +1080,6 @@ int main_loop(int argc, const char **argv_) {
}
}
}
-
- if (stop_after && frame_in >= stop_after)
- break;
}
if (summary || progress) {
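
The flush path added above follows libvpx's drain convention: a NULL-data call to vpx_codec_decode() signals end of stream, after which any frames a frame-parallel decoder is still holding come out of vpx_codec_get_frame(). A minimal sketch under that convention:

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"

    /* Signal EOS, then drain whatever frames the decoder still buffers. */
    static void flush_and_drain(vpx_codec_ctx_t *decoder) {
      vpx_codec_iter_t iter = NULL;
      vpx_image_t *img;
      if (vpx_codec_decode(decoder, NULL, 0, NULL, 0))
        fprintf(stderr, "flush failed: %s\n", vpx_codec_error(decoder));
      while ((img = vpx_codec_get_frame(decoder, &iter)) != NULL) {
        /* write or display img */
      }
    }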
diff --git a/chromium/third_party/libvpx/source/libvpx/vpxenc.c b/chromium/third_party/libvpx/source/libvpx/vpxenc.c
index 0a0c0718bc5..851d43291cd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpxenc.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpxenc.c
@@ -183,8 +183,10 @@ static const arg_def_t recontest = ARG_DEF_ENUM(
NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum);
static const arg_def_t framerate = ARG_DEF(
NULL, "fps", 1, "Stream frame rate (rate/scale)");
+static const arg_def_t use_webm = ARG_DEF(
+ NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)");
static const arg_def_t use_ivf = ARG_DEF(
- NULL, "ivf", 0, "Output IVF (default is WebM if WebM IO is enabled)");
+ NULL, "ivf", 0, "Output IVF");
static const arg_def_t out_part = ARG_DEF(
"P", "output-partitions", 0,
"Makes encoder output partitions. Requires IVF output!");
@@ -208,7 +210,7 @@ static const arg_def_t *main_args[] = {
&debugmode,
&outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &skip,
&deadline, &best_dl, &good_dl, &rt_dl,
- &quietarg, &verbosearg, &psnrarg, &use_ivf, &out_part, &q_hist_n,
+ &quietarg, &verbosearg, &psnrarg, &use_webm, &use_ivf, &out_part, &q_hist_n,
&rate_hist_n, &disable_warnings, &disable_warning_prompt,
NULL
};
@@ -328,8 +330,10 @@ static const arg_def_t sharpness = ARG_DEF(
NULL, "sharpness", 1, "Loop filter sharpness (0..7)");
static const arg_def_t static_thresh = ARG_DEF(
NULL, "static-thresh", 1, "Motion detection threshold");
-static const arg_def_t cpu_used = ARG_DEF(
+static const arg_def_t cpu_used_vp8 = ARG_DEF(
NULL, "cpu-used", 1, "CPU Used (-16..16)");
+static const arg_def_t cpu_used_vp9 = ARG_DEF(
+ NULL, "cpu-used", 1, "CPU Used (-8..8)");
static const arg_def_t auto_altref = ARG_DEF(
NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames");
static const arg_def_t arnr_maxframes = ARG_DEF(
@@ -349,14 +353,21 @@ static const arg_def_t cq_level = ARG_DEF(
NULL, "cq-level", 1, "Constant/Constrained Quality level");
static const arg_def_t max_intra_rate_pct = ARG_DEF(
NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)");
+static const arg_def_t max_inter_rate_pct = ARG_DEF(
+ NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)");
+static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
+ NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
+
+static const arg_def_t screen_content_mode = ARG_DEF(
+ NULL, "screen-content-mode", 1, "Screen content mode");
#if CONFIG_VP8_ENCODER
static const arg_def_t token_parts = ARG_DEF(
NULL, "token-parts", 1, "Number of token partitions to use, log2");
static const arg_def_t *vp8_args[] = {
- &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
+ &cpu_used_vp8, &auto_altref, &noise_sens, &sharpness, &static_thresh,
&token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
- &tune_ssim, &cq_level, &max_intra_rate_pct,
+ &tune_ssim, &cq_level, &max_intra_rate_pct, &screen_content_mode,
NULL
};
static const int vp8_arg_ctrl_map[] = {
@@ -365,6 +376,7 @@ static const int vp8_arg_ctrl_map[] = {
VP8E_SET_TOKEN_PARTITIONS,
VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ VP8E_SET_SCREEN_CONTENT_MODE,
0
};
#endif
@@ -386,6 +398,22 @@ static const arg_def_t frame_periodic_boost = ARG_DEF(
NULL, "frame-boost", 1,
"Enable frame periodic boost (0: off (default), 1: on)");
+static const struct arg_enum_list color_space_enum[] = {
+ { "unknown", VPX_CS_UNKNOWN },
+ { "bt601", VPX_CS_BT_601 },
+ { "bt709", VPX_CS_BT_709 },
+ { "smpte170", VPX_CS_SMPTE_170 },
+ { "smpte240", VPX_CS_SMPTE_240 },
+ { "bt2020", VPX_CS_BT_2020 },
+ { "reserved", VPX_CS_RESERVED },
+ { "sRGB", VPX_CS_SRGB },
+ { NULL, 0 }
+};
+
+static const arg_def_t input_color_space = ARG_DEF_ENUM(
+ NULL, "color-space", 1,
+ "The color space of input content:", color_space_enum);
+
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
static const struct arg_enum_list bitdepth_enum[] = {
{"8", VPX_BITS_8},
@@ -412,11 +440,12 @@ static const arg_def_t tune_content = ARG_DEF_ENUM(
NULL, "tune-content", 1, "Tune content type", tune_content_enum);
static const arg_def_t *vp9_args[] = {
- &cpu_used, &auto_altref, &sharpness, &static_thresh,
+ &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh,
&tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
- &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless,
+ &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct,
+ &gf_cbr_boost_pct, &lossless,
&frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
- &noise_sens, &tune_content,
+ &noise_sens, &tune_content, &input_color_space,
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
&bitdeptharg, &inbitdeptharg,
#endif
@@ -428,9 +457,10 @@ static const int vp9_arg_ctrl_map[] = {
VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,
VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ VP9E_SET_MAX_INTER_BITRATE_PCT, VP9E_SET_GF_CBR_BOOST_PCT,
VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
- VP9E_SET_TUNE_CONTENT,
+ VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
0
};
#endif
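
Each new entry in the control map pairs a command-line flag with the codec control it drives; for example, --color-space=bt709 ultimately becomes a call like the following (illustrative, assuming an initialized encoder context):

    vpx_codec_control(&encoder, VP9E_SET_COLOR_SPACE, VPX_CS_BT_709);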
@@ -1038,8 +1068,7 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
continue;
}
- if (0) {
- } else if (arg_match(&arg, &outputfile, argi)) {
+ if (arg_match(&arg, &outputfile, argi)) {
config->out_fn = arg.val;
} else if (arg_match(&arg, &fpf_name, argi)) {
config->stats_fn = arg.val;
@@ -1047,6 +1076,12 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
} else if (arg_match(&arg, &fpmbf_name, argi)) {
config->fpmb_stats_fn = arg.val;
#endif
+ } else if (arg_match(&arg, &use_webm, argi)) {
+#if CONFIG_WEBM_IO
+ config->write_webm = 1;
+#else
+ die("Error: --webm specified but webm is disabled.");
+#endif
} else if (arg_match(&arg, &use_ivf, argi)) {
config->write_webm = 0;
} else if (arg_match(&arg, &threads, argi)) {
@@ -1187,6 +1222,7 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
static void validate_stream_config(const struct stream_state *stream,
const struct VpxEncoderConfig *global) {
const struct stream_state *streami;
+ (void)global;
if (!stream->config.cfg.g_w || !stream->config.cfg.g_h)
fatal("Stream %d: Specify stream dimensions with --width (-w) "
diff --git a/chromium/third_party/libvpx/source/libvpx/vpxstats.c b/chromium/third_party/libvpx/source/libvpx/vpxstats.c
index 5f88f8d3597..172d8937cdc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpxstats.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpxstats.c
@@ -41,6 +41,9 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
stats->file = fopen(fpf, "rb");
+ if (stats->file == NULL)
+ fatal("First-pass stats file does not exist!");
+
if (fseek(stats->file, 0, SEEK_END))
fatal("First-pass stats file must be seekable!");
diff --git a/chromium/third_party/libvpx/source/libvpx/webmdec.cc b/chromium/third_party/libvpx/source/libvpx/webmdec.cc
index 4383e8efd8d..e152f5ee03a 100644
--- a/chromium/third_party/libvpx/source/libvpx/webmdec.cc
+++ b/chromium/third_party/libvpx/source/libvpx/webmdec.cc
@@ -41,6 +41,7 @@ void reset(struct WebmInputContext *const webm_ctx) {
webm_ctx->block_frame_index = 0;
webm_ctx->video_track_index = 0;
webm_ctx->timestamp_ns = 0;
+ webm_ctx->is_key_frame = false;
}
void get_first_cluster(struct WebmInputContext *const webm_ctx) {
@@ -62,6 +63,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
struct VpxInputContext *vpx_ctx) {
mkvparser::MkvReader *const reader = new mkvparser::MkvReader(vpx_ctx->file);
webm_ctx->reader = reader;
+ webm_ctx->reached_eos = 0;
mkvparser::EBMLHeader header;
long long pos = 0;
@@ -120,6 +122,11 @@ int webm_read_frame(struct WebmInputContext *webm_ctx,
uint8_t **buffer,
size_t *bytes_in_buffer,
size_t *buffer_size) {
+ // This check is needed for frame parallel decoding, in which case this
+ // function could be called even after it has reached end of input stream.
+ if (webm_ctx->reached_eos) {
+ return 1;
+ }
mkvparser::Segment *const segment =
reinterpret_cast<mkvparser::Segment*>(webm_ctx->segment);
const mkvparser::Cluster* cluster =
@@ -139,6 +146,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx,
cluster = segment->GetNext(cluster);
if (cluster == NULL || cluster->EOS()) {
*bytes_in_buffer = 0;
+ webm_ctx->reached_eos = 1;
return 1;
}
status = cluster->GetFirst(block_entry);
@@ -182,6 +190,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx,
}
*bytes_in_buffer = frame.len;
webm_ctx->timestamp_ns = block->GetTime(cluster);
+ webm_ctx->is_key_frame = block->IsKey();
mkvparser::MkvReader *const reader =
reinterpret_cast<mkvparser::MkvReader*>(webm_ctx->reader);
@@ -210,6 +219,7 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx,
webm_ctx->block_entry = NULL;
webm_ctx->block_frame_index = 0;
webm_ctx->timestamp_ns = 0;
+ webm_ctx->reached_eos = 0;
return 0;
}
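
With reached_eos tracked, a caller can keep the usual read loop and safely call webm_read_frame() again after end of stream during frame-parallel decode. A hedged sketch of the consuming loop (drain_webm is a hypothetical helper, and the NULL initial buffer follows vpxdec's usage):

    #include <stddef.h>
    #include <stdint.h>
    #include "webmdec.h"

    /* webm_read_frame() returns nonzero at EOS, and reached_eos keeps any
     * post-EOS calls returning nonzero instead of touching the parser. */
    static void drain_webm(struct WebmInputContext *webm_ctx) {
      uint8_t *buf = NULL;
      size_t bytes_in_buffer = 0, buffer_size = 0;
      while (!webm_read_frame(webm_ctx, &buf, &bytes_in_buffer, &buffer_size)) {
        /* feed buf / bytes_in_buffer to the decoder */
      }
    }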
diff --git a/chromium/third_party/libvpx/source/libvpx/webmdec.h b/chromium/third_party/libvpx/source/libvpx/webmdec.h
index 29b815da125..7d163803552 100644
--- a/chromium/third_party/libvpx/source/libvpx/webmdec.h
+++ b/chromium/third_party/libvpx/source/libvpx/webmdec.h
@@ -28,6 +28,8 @@ struct WebmInputContext {
int block_frame_index;
int video_track_index;
uint64_t timestamp_ns;
+ int is_key_frame;
+ int reached_eos;
};
// Checks if the input is a WebM file. If so, initializes WebMInputContext so