author      Allan Sandfeld Jensen <allan.jensen@qt.io>    2017-07-17 13:57:45 +0200
committer   Allan Sandfeld Jensen <allan.jensen@qt.io>    2017-07-19 13:44:40 +0000
commit      6ec7b8da05d21a3878bd21c691b41e675d74bb1c (patch)
tree        b87f250bc19413750b9bb9cdbf2da20ef5014820 /chromium/third_party/libvpx
parent      ec02ee4181c49b61fce1c8fb99292dbb8139cc90 (diff)
download    qtwebengine-chromium-6ec7b8da05d21a3878bd21c691b41e675d74bb1c.tar.gz
BASELINE: Update Chromium to 60.0.3112.70
Change-Id: I9911c2280a014d4632f254857876a395d4baed2d
Reviewed-by: Alexandru Croitor <alexandru.croitor@qt.io>
Diffstat (limited to 'chromium/third_party/libvpx')
134 files changed, 6770 insertions, 3356 deletions
diff --git a/chromium/third_party/libvpx/README.chromium b/chromium/third_party/libvpx/README.chromium
index 414280e6fbf..04399d3d7f6 100644
--- a/chromium/third_party/libvpx/README.chromium
+++ b/chromium/third_party/libvpx/README.chromium
@@ -5,9 +5,9 @@ License: BSD
 License File: source/libvpx/LICENSE
 Security Critical: yes
 
-Date: Monday April 10 2017
+Date: Monday May 22 2017
 Branch: master
-Commit: f22b828d685adee4c7a561990302e2d21b5e0047
+Commit: b3bf91bdc60220c004a22d21c867cc392e684b81
 
 Description:
 Contains the sources used to compile libvpx binaries used by Google Chrome and
diff --git a/chromium/third_party/libvpx/libvpx_srcs.gni b/chromium/third_party/libvpx/libvpx_srcs.gni
index a39f4572712..08b2a5c2d75 100644
--- a/chromium/third_party/libvpx/libvpx_srcs.gni
+++ b/chromium/third_party/libvpx/libvpx_srcs.gni
@@ -324,7 +324,9 @@ libvpx_srcs_x86 = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/txfm_common_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_asm_stubs.c",
   "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h",
@@ -372,7 +374,6 @@ libvpx_srcs_x86_assembly = [
   "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm",
-  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/add_noise_sse2.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm",
@@ -414,7 +415,12 @@ libvpx_srcs_x86_sse2 = [
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_sse2.c",
@@ -432,12 +438,15 @@ libvpx_srcs_x86_ssse3 = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c",
 ]
-libvpx_srcs_x86_sse4_1 =
-    [ "//third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse4.c" ]
+libvpx_srcs_x86_sse4_1 = [
+  "//third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse4.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c",
+]
 libvpx_srcs_x86_avx = [ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c" ]
 libvpx_srcs_x86_avx2 = [
-  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/loopfilter_avx2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c",
@@ -766,7 +775,9 @@ libvpx_srcs_x86_64 = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/txfm_common_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_asm_stubs.c",
   "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h",
@@ -816,7 +827,6 @@ libvpx_srcs_x86_64_assembly = [
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm",
-  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/add_noise_sse2.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm",
@@ -863,7 +873,12 @@ libvpx_srcs_x86_64_sse2 = [
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_sse2.c",
@@ -881,12 +896,15 @@ libvpx_srcs_x86_64_ssse3 = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c",
 ]
-libvpx_srcs_x86_64_sse4_1 =
-    [ "//third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse4.c" ]
+libvpx_srcs_x86_64_sse4_1 = [
+  "//third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse4.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c",
+]
 libvpx_srcs_x86_64_avx = [ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c" ]
 libvpx_srcs_x86_64_avx2 = [
-  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/loopfilter_avx2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c",
@@ -1434,6 +1452,7 @@ libvpx_srcs_arm_neon = [
   "//third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c",
   "//third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_alt_ref_aq.c",
@@ -1533,6 +1552,7 @@ libvpx_srcs_arm_neon = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
@@ -1545,6 +1565,7 @@ libvpx_srcs_arm_neon = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c",
@@ -1903,6 +1924,7 @@ libvpx_srcs_arm_neon_cpu_detect = [
   "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/avg.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/bitreader.c",
@@ -2000,10 +2022,12 @@ libvpx_srcs_arm_neon_cpu_detect_neon = [
   "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
@@ -2226,6 +2250,7 @@ libvpx_srcs_arm64 = [
   "//third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c",
   "//third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_alt_ref_aq.c",
@@ -2325,6 +2350,7 @@ libvpx_srcs_arm64 = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
@@ -2340,6 +2366,7 @@ libvpx_srcs_arm64 = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c",
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
index 957219d5a8a..3b104550883 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
@@ -14,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
 
 struct macroblockd;
 
@@ -37,7 +38,8 @@ int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
 #define vp9_block_error_fp vp9_block_error_fp_neon
 
 int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
-#define vp9_denoiser_filter vp9_denoiser_filter_c
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+#define vp9_denoiser_filter vp9_denoiser_filter_neon
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -85,10 +87,10 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int sk
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
 #define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
 
 void vp9_rtcd(void);
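[Editor's note: in a fixed-ISA config like ios/arm-neon above, the generated rtcd header binds each generic entry point to one implementation with a plain #define, so switching vp9_denoiser_filter from the C to the NEON kernel adds no run-time cost. A minimal sketch of that static binding follows; all names are illustrative stand-ins, not libvpx code.]

    #include <stdio.h>

    static int denoise_c(int x)    { return x; }  /* portable path */
    static int denoise_neon(int x) { return x; }  /* SIMD path stand-in */

    /* A fixed-ISA rtcd header picks the implementation when the header is
     * generated, cf. "#define vp9_denoiser_filter vp9_denoiser_filter_neon". */
    #define denoise denoise_neon

    int main(void) {
      (void)denoise_c;              /* unused here; kept to mirror both paths */
      printf("%d\n", denoise(42));  /* compiles straight to denoise_neon(42) */
      return 0;
    }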
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
index 6d960874198..61c4f1fe737 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
@@ -228,7 +228,8 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
 
 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_neon
 
 void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct4x4_1 vpx_fdct4x4_1_c
@@ -676,20 +677,24 @@ uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_str
 #define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon
 
 uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon
 
 uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon
 
 uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon
 
 uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon
 
 uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon
 
 uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
@@ -698,17 +703,20 @@ uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride,
 #define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
 
 uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon
 
 uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon
 
 uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon
 
 uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon
 
 uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -758,14 +766,16 @@ unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, c
 #define vpx_variance16x16 vpx_variance16x16_neon
 
 unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_neon
 
 unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance16x8 vpx_variance16x8_neon
 
 unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_neon
 
 unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -794,7 +804,8 @@ unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
 #define vpx_variance8x16 vpx_variance8x16_neon
 
 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_neon
 
 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h
index 957219d5a8a..3b104550883 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h
@@ -14,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
 
 struct macroblockd;
 
@@ -37,7 +38,8 @@ int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
 #define vp9_block_error_fp vp9_block_error_fp_neon
 
 int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
-#define vp9_denoiser_filter vp9_denoiser_filter_c
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+#define vp9_denoiser_filter vp9_denoiser_filter_neon
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -85,10 +87,10 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int sk
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
 #define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
 
 void vp9_rtcd(void);
diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h
index 6d960874198..61c4f1fe737 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h
@@ -228,7 +228,8 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
 
 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_neon
 
 void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct4x4_1 vpx_fdct4x4_1_c
@@ -676,20 +677,24 @@ uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_str
 #define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon
 
 uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon
 
 uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon
 
 uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon
 
 uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon
 
 uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon
 
 uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
@@ -698,17 +703,20 @@ uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride,
 #define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
 
 uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon
 
 uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon
 
 uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon
 
 uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon
 
 uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -758,14 +766,16 @@ unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, c
 #define vpx_variance16x16 vpx_variance16x16_neon
 
 unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_neon
 
 unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance16x8 vpx_variance16x8_neon
 
 unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_neon
 
 unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -794,7 +804,8 @@ unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
 #define vpx_variance8x16 vpx_variance8x16_neon
 
 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_neon
 
 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
index 9129bf63688..015772d2b4b 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
@@ -14,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
 
 struct macroblockd;
 
@@ -37,7 +38,8 @@ int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
 RTCD_EXTERN int64_t (*vp9_block_error_fp)(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
 
 int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
-#define vp9_denoiser_filter vp9_denoiser_filter_c
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+RTCD_EXTERN int (*vp9_denoiser_filter)(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -85,10 +87,10 @@ RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coef
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
 #define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
 
 void vp9_rtcd(void);
@@ -105,6 +107,8 @@ static void setup_rtcd_internal(void)
     vp9_block_error_fp = vp9_block_error_fp_c;
     if (flags & HAS_NEON) vp9_block_error_fp = vp9_block_error_fp_neon;
+    vp9_denoiser_filter = vp9_denoiser_filter_c;
+    if (flags & HAS_NEON) vp9_denoiser_filter = vp9_denoiser_filter_neon;
     vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
     if (flags & HAS_NEON) vp9_fdct8x8_quant = vp9_fdct8x8_quant_neon;
     vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
index f8e41363a38..c818a5184df 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
@@ -228,7 +228,8 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
 
 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct4x4)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct4x4_1 vpx_fdct4x4_1_c
@@ -676,20 +677,24 @@ uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_str
 RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
@@ -698,17 +703,20 @@ uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride,
 #define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
 
 uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -758,14 +766,16 @@ unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, c
 RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -794,7 +804,8 @@ unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
 RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -887,6 +898,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_neon;
     vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c;
     if (flags & HAS_NEON) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_neon;
+    vpx_fdct4x4 = vpx_fdct4x4_c;
+    if (flags & HAS_NEON) vpx_fdct4x4 = vpx_fdct4x4_neon;
     vpx_fdct8x8 = vpx_fdct8x8_c;
     if (flags & HAS_NEON) vpx_fdct8x8 = vpx_fdct8x8_neon;
     vpx_fdct8x8_1 = vpx_fdct8x8_1_c;
@@ -997,10 +1010,24 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_satd = vpx_satd_neon;
     vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_c;
     if (flags & HAS_NEON) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_neon;
+    vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_neon;
+    vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_neon;
+    vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_neon;
     vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_c;
     if (flags & HAS_NEON) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_neon;
+    vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_neon;
+    vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_neon;
     vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_c;
     if (flags & HAS_NEON) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_neon;
+    vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_neon;
+    vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_neon;
     vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_c;
     if (flags & HAS_NEON) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_neon;
     vpx_subtract_block = vpx_subtract_block_c;
@@ -1023,8 +1050,12 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_neon;
     vpx_variance16x16 = vpx_variance16x16_c;
     if (flags & HAS_NEON) vpx_variance16x16 = vpx_variance16x16_neon;
+    vpx_variance16x32 = vpx_variance16x32_c;
+    if (flags & HAS_NEON) vpx_variance16x32 = vpx_variance16x32_neon;
     vpx_variance16x8 = vpx_variance16x8_c;
     if (flags & HAS_NEON) vpx_variance16x8 = vpx_variance16x8_neon;
+    vpx_variance32x16 = vpx_variance32x16_c;
+    if (flags & HAS_NEON) vpx_variance32x16 = vpx_variance32x16_neon;
     vpx_variance32x32 = vpx_variance32x32_c;
     if (flags & HAS_NEON) vpx_variance32x32 = vpx_variance32x32_neon;
     vpx_variance32x64 = vpx_variance32x64_c;
@@ -1035,6 +1066,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_variance64x64 = vpx_variance64x64_neon;
     vpx_variance8x16 = vpx_variance8x16_c;
     if (flags & HAS_NEON) vpx_variance8x16 = vpx_variance8x16_neon;
+    vpx_variance8x4 = vpx_variance8x4_c;
+    if (flags & HAS_NEON) vpx_variance8x4 = vpx_variance8x4_neon;
     vpx_variance8x8 = vpx_variance8x8_c;
     if (flags & HAS_NEON) vpx_variance8x8 = vpx_variance8x8_neon;
     vpx_vector_var = vpx_vector_var_c;
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h
index 957219d5a8a..3b104550883 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h
@@ -14,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
 
 struct macroblockd;
 
@@ -37,7 +38,8 @@ int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
 #define vp9_block_error_fp vp9_block_error_fp_neon
 
 int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
-#define vp9_denoiser_filter vp9_denoiser_filter_c
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+#define vp9_denoiser_filter vp9_denoiser_filter_neon
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -85,10 +87,10 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int sk
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
 #define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
 
 void vp9_rtcd(void);
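[Editor's note: the arm-neon-cpu-detect configuration above cannot bind at build time, so its generated header declares each entry point as an RTCD_EXTERN function pointer and setup_rtcd_internal() repoints it when the runtime flags report NEON. The following self-contained C sketch shows the shape of that pointer-dispatch pattern; every name is a hypothetical stand-in, and only the C path computes a real 8x4 variance (var = sse - sum^2/32).]

    #include <stdint.h>
    #include <stdio.h>

    typedef unsigned int (*variance_fn)(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        unsigned int *sse);

    /* Portable reference path: a straightforward 8x4 variance. */
    static unsigned int variance8x4_c(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      unsigned int *sse) {
      int64_t sum = 0;
      uint64_t sq = 0;
      for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 8; ++c) {
          const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
          sum += d;
          sq += (uint64_t)(d * d);
        }
      }
      *sse = (unsigned int)sq;
      return (unsigned int)(sq - (uint64_t)((sum * sum) / 32));
    }

    /* Stand-in for the NEON kernel; in libvpx this is real NEON code. */
    static unsigned int variance8x4_neon(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         unsigned int *sse) {
      return variance8x4_c(src, src_stride, ref, ref_stride, sse);
    }

    #define MY_HAS_NEON 0x01    /* stands in for libvpx's HAS_NEON bit */

    static variance_fn my_variance8x4;  /* like an RTCD_EXTERN pointer */

    static void my_setup_rtcd(int flags) {
      /* Same shape as the generated setup_rtcd_internal() above:
       * default to the C path, then upgrade if the CPU has NEON. */
      my_variance8x4 = variance8x4_c;
      if (flags & MY_HAS_NEON) my_variance8x4 = variance8x4_neon;
    }

    int main(void) {
      uint8_t src[32], ref[32];
      for (int i = 0; i < 32; ++i) { src[i] = (uint8_t)i; ref[i] = (uint8_t)(i + 1); }
      my_setup_rtcd(MY_HAS_NEON);  /* pretend the CPU advertised NEON */
      unsigned int sse;
      unsigned int var = my_variance8x4(src, 8, ref, 8, &sse);
      printf("var=%u sse=%u\n", var, sse);
      return 0;
    }

[The fixed-ISA headers avoid this indirection entirely, which is why the patch updates both flavors of each generated config.]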
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
index 6d960874198..61c4f1fe737 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
@@ -228,7 +228,8 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct32x32_rd vpx_fdct32x32_rd_c

 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_neon

 void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct4x4_1 vpx_fdct4x4_1_c
@@ -676,20 +677,24 @@ uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_str
 #define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon

 uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon

 uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon

 uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon

 uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon

 uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon

 uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
@@ -698,17 +703,20 @@ uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride,
 #define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c

 uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon

 uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon

 uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon

 uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon

 uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -758,14 +766,16 @@ unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, c
 #define vpx_variance16x16 vpx_variance16x16_neon

 unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_neon

 unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance16x8 vpx_variance16x8_neon

 unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_neon

 unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -794,7 +804,8 @@ unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
 #define vpx_variance8x16 vpx_variance8x16_neon

 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_neon

 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h
index 5d9d14d08a0..e259775c0e3 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h
@@ -14,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"

 struct macroblockd;

@@ -80,10 +81,10 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c

-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c

-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
 #define vp9_temporal_filter_apply vp9_temporal_filter_apply_c

 void vp9_rtcd(void);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h
index 957219d5a8a..3b104550883 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h
@@ -14,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"

 struct macroblockd;

@@ -37,7 +38,8 @@ int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
 #define vp9_block_error_fp vp9_block_error_fp_neon

 int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
-#define vp9_denoiser_filter vp9_denoiser_filter_c
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+#define vp9_denoiser_filter vp9_denoiser_filter_neon

 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -85,10 +87,10 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int sk
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c

-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c

-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
 #define vp9_temporal_filter_apply vp9_temporal_filter_apply_c

 void vp9_rtcd(void);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
index 6d960874198..61c4f1fe737 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
@@ -228,7 +228,8 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct32x32_rd vpx_fdct32x32_rd_c

 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_neon

 void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct4x4_1 vpx_fdct4x4_1_c
@@ -676,20 +677,24 @@ uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_str
 #define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon

 uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon

 uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon

 uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon

 uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon

 uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon

 uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
@@ -698,17 +703,20 @@ uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride,
 #define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c

 uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon

 uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon

 uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon

 uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon

 uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -758,14 +766,16 @@ unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, c
 #define vpx_variance16x16 vpx_variance16x16_neon

 unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_neon

 unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance16x8 vpx_variance16x8_neon

 unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_neon

 unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -794,7 +804,8 @@ unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
 #define vpx_variance8x16 vpx_variance8x16_neon

 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_neon

 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h
index 4a32a38e064..0e14191aaec 100644
--- a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h
@@ -14,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"

 struct macroblockd;

@@ -80,13 +81,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
 void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c

-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
 #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c

-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
 #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c

-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
 #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c

 void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -104,7 +105,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
 void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c

-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
 #define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c

 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -122,10 +123,10 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c

-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c

-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
 #define vp9_temporal_filter_apply vp9_temporal_filter_apply_c

 void vp9_rtcd(void);
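Two encoder-side signature changes repeat in every per-platform header this update regenerates: vp9_scale_and_extend_frame() gains explicit INTERP_FILTER filter_type and int phase_scaler arguments, and the temporal filter's accumulator switches from unsigned int * to the exact-width uint32_t *. A hedged caller sketch for the new scaling signature; EIGHTTAP comes from the INTERP_FILTER enum in vp9/common/vp9_filter.h (which these headers now include), and whether EIGHTTAP with phase 0 reproduces the previously hard-coded behaviour is not something this diff shows:

    #include "vp9/common/vp9_filter.h"  /* INTERP_FILTER, EIGHTTAP */
    #include "./vp9_rtcd.h"             /* vp9_scale_and_extend_frame */

    /* Assumes src and dst are already-allocated yv12_buffer_config frames. */
    static void rescale(const struct yv12_buffer_config *src,
                        struct yv12_buffer_config *dst) {
      /* The interpolation filter and sub-pel phase are now chosen per call. */
      vp9_scale_and_extend_frame(src, dst, EIGHTTAP, 0);
    }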
diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
index 8d126acaa9f..a09ed559657 100644
--- a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
@@ -652,28 +652,28 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
 void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
 #define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c

-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define vpx_highbd_convolve8 vpx_highbd_convolve8_c

-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c

-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c

-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c

-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c

-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c

-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c

-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
 #define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c

 void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -832,49 +832,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
 #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c

-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c

-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c

-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c

-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c

-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c

-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c

-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c

-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c

-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c

-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c

-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c

-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c

-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c

-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c

-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c

 void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
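The generic vpx_dsp header above also records the high-bitdepth pointer cleanup in this update: the highbd convolve and inverse-transform hooks now take uint16_t * pixel buffers directly instead of uint8_t * handles that each implementation had to reinterpret internally. Code that still carries the legacy byte-pointer alias converts once at the call boundary; libvpx's CONVERT_TO_SHORTPTR macro (vpx_dsp/vpx_dsp_common.h) is the usual tool. A sketch, assuming a CONFIG_VP9_HIGHBITDEPTH build in which dest8 is such an aliased handle:

    #include "vpx_dsp/vpx_dsp_common.h"  /* tran_low_t, CONVERT_TO_SHORTPTR */
    #include "./vpx_dsp_rtcd.h"          /* vpx_highbd_idct4x4_16_add */

    static void add_idct(const tran_low_t *input, uint8_t *dest8,
                         int stride, int bd) {
      /* Unwrap the byte alias to the buffer's real 16-bit element type. */
      uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
      vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
    }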
diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
index 6b04a45895d..c178d191672 100644
--- a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
@@ -14,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"

 struct macroblockd;

@@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
 void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c

-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
 #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c

-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
 #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c

-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
 #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c

 void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
 void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c

-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
 #define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c

 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -141,13 +142,13 @@ RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coef
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c

-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);

-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);

 void vp9_rtcd(void);
@@ -198,7 +199,7 @@ static void setup_rtcd_internal(void)
     vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
     if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
     vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
-    if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
+    if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
 }
 #endif
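The hunk above also explains the source-list churn in this update: the ia32 temporal-filter fast path moves from the previous SSE2 code to a new SSE4.1 intrinsics implementation, so the run-time check changes from HAS_SSE2 to HAS_SSE4_1 and CPUs without SSE4.1 fall back to the C version. The flags come from x86_simd_caps() in vpx_ports/x86.h; a small illustrative sketch of querying them directly (not how libvpx callers normally work — they call vp9_rtcd() once and use the bound pointer):

    #include "vpx_ports/x86.h"  /* x86_simd_caps(), HAS_SSE2, HAS_SSE4_1 */

    /* Returns nonzero when the SSE4.1 temporal filter would be selected. */
    static int sse4_temporal_filter_available(void) {
      const int flags = x86_simd_caps();
      return (flags & HAS_SSE4_1) != 0;
    }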
diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
index 2ebbf6e3fa3..49450cda3db 100644
--- a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
@@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
 RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p);

 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);

 void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -824,31 +825,39 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
 void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
 #define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c

-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);

-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);

-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);

-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);

-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);

-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);

-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);

-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);

 void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
 #define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
@@ -1015,56 +1024,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
 #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c

-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);

-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c

-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);

-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);

-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c

-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c

-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);

-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c

-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);

-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c

-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);

-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c

-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);

-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c

-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
 #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c

 void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -2030,6 +2039,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2;
     vpx_avg_8x8 = vpx_avg_8x8_c;
     if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2;
+    vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+    if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
     vpx_convolve8 = vpx_convolve8_c;
     if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
@@ -2360,10 +2371,24 @@
     if (flags & HAS_SSE2) vpx_highbd_8_variance8x16 = vpx_highbd_8_variance8x16_sse2;
     vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_c;
     if (flags & HAS_SSE2) vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_sse2;
+    vpx_highbd_convolve8 = vpx_highbd_convolve8_c;
+    if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2;
+    vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_c;
+    if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2;
+    vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_c;
+    if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2;
+    vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_c;
+    if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2;
+    vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_c;
+    if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2;
+    vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_c;
+    if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2;
     vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_c;
     if (flags & HAS_SSE2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2;
+    if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2;
     vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_c;
     if (flags & HAS_SSE2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
+    if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2;
     vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_c;
     if (flags & HAS_SSE2) vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_sse2;
     vpx_highbd_dc_predictor_32x32 = vpx_highbd_dc_predictor_32x32_c;
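Note the assignment order inside setup_rtcd_internal() above: the C version is installed first, then the SSE2 variant if the CPU has it, then the AVX2 variant, so the last passing check wins and the widest available implementation ends up bound. The same last-writer-wins idiom in miniature, with hypothetical kernel_* names standing in for the real convolve entries:

    #include "vpx_ports/x86.h"  /* x86_simd_caps(), HAS_SSE2, HAS_AVX2 */

    typedef void (*kernel_fn)(void);
    static void kernel_c(void)    { /* baseline */ }
    static void kernel_sse2(void) { /* SSE2 variant */ }
    static void kernel_avx2(void) { /* AVX2 variant */ }

    static kernel_fn pick_kernel(void) {
      const int flags = x86_simd_caps();
      kernel_fn fn = kernel_c;                 /* always-correct default */
      if (flags & HAS_SSE2) fn = kernel_sse2;  /* checks run in ascending order... */
      if (flags & HAS_AVX2) fn = kernel_avx2;  /* ...so the widest ISA wins */
      return fn;
    }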
yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); #define vp9_temporal_filter_apply vp9_temporal_filter_apply_c void vp9_rtcd(void); diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h index c0174f2ffa8..c7f905eb1e8 100644 --- a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h @@ -14,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" struct macroblockd; @@ -80,10 +81,10 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); #define vp9_temporal_filter_apply vp9_temporal_filter_apply_c void vp9_rtcd(void); diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h index 58a2d4e7268..56d5840ce95 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h @@ -14,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" struct macroblockd; @@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c -void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void 
vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c -void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); #define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); @@ -143,13 +144,13 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int 
stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2 +void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); void vp9_rtcd(void); @@ -176,6 +177,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; + vp9_temporal_filter_apply = vp9_temporal_filter_apply_c; + if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1; } #endif diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h index 889fca7c45a..b2403c36bc4 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h @@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); #define vpx_avg_8x8 vpx_avg_8x8_sse2 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_sse2 void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -825,37 +826,45 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); #define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c -void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8 vpx_highbd_convolve8_sse2 - -void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void 
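The setup_rtcd_internal() hunks above follow libvpx's run-time CPU detection (RTCD) pattern: each entry point is either a #define alias to the one variant known at build time, or an RTCD_EXTERN function pointer that is assigned at startup from the CPU feature flags. Below is a minimal, self-contained sketch of that pattern; the names my_op, my_op_c, my_op_sse4_1 and the flag values are illustrative stand-ins, not libvpx symbols (in libvpx the flags come from the SIMD-capabilities probe and RTCD_EXTERN controls the pointer's linkage).

/* Minimal sketch of the RTCD dispatch pattern, with made-up names. */
#include <stdint.h>
#include <stdio.h>

#define HAS_SSE2 (1 << 0)
#define HAS_SSE4_1 (1 << 1)

static int my_op_c(const uint8_t *p) { return p[0] + p[1]; }        /* portable fallback */
static int my_op_sse4_1(const uint8_t *p) { return p[0] + p[1]; }   /* stand-in for a SIMD kernel */

static int (*my_op)(const uint8_t *p); /* plays the role of an RTCD_EXTERN pointer */

static void setup_rtcd_internal(int flags) {
  my_op = my_op_c;                              /* safest default first */
  if (flags & HAS_SSE4_1) my_op = my_op_sse4_1; /* later assignments override, so the best match wins */
}

int main(void) {
  const uint8_t px[2] = {3, 4};
  setup_rtcd_internal(HAS_SSE2);   /* no SSE4.1: stays on the C version */
  printf("%d\n", my_op(px));
  setup_rtcd_internal(HAS_SSE4_1); /* SSE4.1 present: dispatch upgrades */
  printf("%d\n", my_op(px));
  return 0;
}

The assignments run from generic to most specific, so the last matching flag wins; that is why vp9_temporal_filter_apply above now starts at the C version and only upgrades when HAS_SSE4_1 is set, replacing the old unconditional SSE2 #define.

diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
index 889fca7c45a..b2403c36bc4 100644
--- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
@@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
#define vpx_avg_8x8 vpx_avg_8x8_sse2
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_sse2
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -825,37 +826,45 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8 vpx_highbd_convolve8_sse2
-
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void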
-void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_sse2
-
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_sse2
-
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_sse2
-
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_sse2
-
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_sse2
-
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2
-
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
@@ -1022,56 +1031,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -2082,6 +2091,22 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2;
vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2;
if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3;
+ vpx_highbd_convolve8 = vpx_highbd_convolve8_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2;
+ vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2;
+ vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2;
+ vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2;
+ vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2;
+ vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2;
+ vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2;
+ vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2;
vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2;
if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3;
vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2;
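The hunks above also retype the whole high-bitdepth convolve and inverse-transform surface from uint8_t * to uint16_t * buffers. High-bitdepth pixels are 16 bits wide, so the old prototypes passed them through a disguised byte pointer that every kernel had to cast back. The sketch below is illustrative only (idct_add_old/idct_add_new and tran_low_t_sketch are made-up names; in libvpx the equivalent conversion is handled by macros in vpx_dsp_common.h):

/* Illustrative sketch of what the uint8_t* -> uint16_t* change removes. */
#include <stdint.h>

typedef int32_t tran_low_t_sketch; /* stands in for libvpx's tran_low_t */

/* Old interface style: the real pixel type hidden behind uint8_t*. */
static void idct_add_old(const tran_low_t_sketch *input, uint8_t *dest8) {
  uint16_t *dest = (uint16_t *)dest8; /* mandatory cast inside the kernel */
  dest[0] = (uint16_t)(dest[0] + input[0]);
}

/* New interface style (this update): the prototype states the real type. */
static void idct_add_new(const tran_low_t_sketch *input, uint16_t *dest) {
  dest[0] = (uint16_t)(dest[0] + input[0]);
}

int main(void) {
  tran_low_t_sketch residual[1] = {5};
  uint16_t plane[1] = {100};
  idct_add_old(residual, (uint8_t *)plane); /* old call sites disguised the pointer */
  idct_add_new(residual, plane);            /* new call sites pass it directly */
  return 0;
}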

diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
index 6b04a45895d..c178d191672 100644
--- a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -141,13 +142,13 @@ RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coef
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
void vp9_rtcd(void);
@@ -198,7 +199,7 @@ static void setup_rtcd_internal(void)
vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
- if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
+ if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
}
#endif
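Across these vp9_rtcd.h configs, the temporal filter's accumulator parameter changes from unsigned int * to uint32_t *, and the x86 fast path moves from an unconditional SSE2 #define to SSE4.1 behind run-time detection. The sketch below is a rough illustration of the accumulation step such an interface feeds (made-up function, no strength-based rounding as the real kernel has); it shows why a fixed-width accumulator matters: weighted pixel sums exceed 16 bits, and uint32_t keeps the memory layout identical for the C and SIMD variants on every ABI.

/* Rough sketch of a temporal-filter accumulate step; illustrative only. */
#include <stdint.h>

static void accumulate_block(const uint8_t *frame2, unsigned int stride,
                             unsigned int w, unsigned int h,
                             int filter_weight,
                             uint32_t *accumulator, uint16_t *count) {
  for (unsigned int y = 0; y < h; ++y) {
    for (unsigned int x = 0; x < w; ++x) {
      unsigned int i = y * w + x;
      /* Weighted sum of co-located pixels across frames; can exceed 16 bits. */
      accumulator[i] += (uint32_t)(filter_weight * frame2[y * stride + x]);
      count[i] += (uint16_t)filter_weight; /* total weight, divided out later */
    }
  }
}

int main(void) {
  uint8_t frame2[4] = {10, 20, 30, 40};
  uint32_t acc[4] = {0};
  uint16_t cnt[4] = {0};
  accumulate_block(frame2, 2, 2, 2, 2, acc, cnt); /* 2x2 block, weight 2 */
  return 0;
}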

diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
index 2ebbf6e3fa3..49450cda3db 100644
--- a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
@@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -824,31 +825,39 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
@@ -1015,56 +1024,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -2030,6 +2039,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2;
vpx_avg_8x8 = vpx_avg_8x8_c;
if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2;
+ vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+ if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
vpx_convolve8 = vpx_convolve8_c;
if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2;
if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
@@ -2360,10 +2371,24 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vpx_highbd_8_variance8x16 = vpx_highbd_8_variance8x16_sse2;
vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_c;
if (flags & HAS_SSE2) vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_sse2;
+ vpx_highbd_convolve8 = vpx_highbd_convolve8_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2;
+ vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2;
+ vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2;
+ vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2;
+ vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2;
+ vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2;
vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_c;
if (flags & HAS_SSE2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2;
vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_c;
if (flags & HAS_SSE2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2;
vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_c;
if (flags & HAS_SSE2) vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_sse2;
vpx_highbd_dc_predictor_32x32 = vpx_highbd_dc_predictor_32x32_c;

diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
index 58a2d4e7268..56d5840ce95 100644
--- a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -143,13 +144,13 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
void vp9_rtcd(void);
@@ -176,6 +177,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
+ vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
+ if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
}
#endif

diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
index 889fca7c45a..b2403c36bc4 100644
--- a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
@@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
#define vpx_avg_8x8 vpx_avg_8x8_sse2
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_sse2
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -825,37 +826,45 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8 vpx_highbd_convolve8_sse2
-
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_sse2
-
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_sse2
-
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_sse2
-
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_sse2
-
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_sse2
-
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2
-
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h,
int bps); + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c @@ -1022,56 +1031,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 -void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int 
stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_256_add_sse2 -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t 
*input, uint8_t *dest, int stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -2082,6 +2091,22 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3; + vpx_highbd_convolve8 = vpx_highbd_convolve8_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2; + vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2; + vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2; + vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2; + vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2; + vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2; + vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2; + vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2; vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3; vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2; diff --git a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h index 4a32a38e064..0e14191aaec 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h @@ -14,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" struct macroblockd; @@ -80,13 +81,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c -void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); +void 
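In each setup_rtcd_internal() above, a pointer is first bound to the safest variant for the target (SSE2 on x86-64, plain C on ia32) and then conditionally re-bound as CPU flags allow, so the strongest supported extension wins by being tested last. A condensed, compilable sketch of that ordering; the HAS_* bit values below are illustrative, the real masks come from vpx_ports/x86.h:

    #include <stdio.h>

    #define HAS_SSE2 (1 << 0) /* illustrative bit values */
    #define HAS_AVX2 (1 << 1)

    static void kernel_c(void)    { puts("C"); }
    static void kernel_sse2(void) { puts("SSE2"); }
    static void kernel_avx2(void) { puts("AVX2"); }
    static void (*kernel)(void) = kernel_c;

    static void setup_rtcd_sketch(int flags) {
      kernel = kernel_sse2;                       /* x86-64 baseline */
      if (flags & HAS_AVX2) kernel = kernel_avx2; /* override if present */
    }

    int main(void) {
      setup_rtcd_sketch(HAS_SSE2 | HAS_AVX2); /* as if CPUID reported both */
      kernel(); /* prints "AVX2" */
      return 0;
    }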
vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -104,7 +105,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c -void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); #define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); @@ -122,10 +123,10 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); #define vp9_temporal_filter_apply vp9_temporal_filter_apply_c void vp9_rtcd(void); diff --git a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h index 8d126acaa9f..a09ed559657 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h @@ -652,28 +652,28 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); #define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c -void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t 
*dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); #define vpx_highbd_convolve8 vpx_highbd_convolve8_c -void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); #define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c -void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); #define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c -void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); #define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c -void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); #define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c -void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); #define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c -void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); #define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c -void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int 
x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); #define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); @@ -832,49 +832,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c -void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t 
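Throughout these headers the high-bitdepth src/dest parameters change from uint8_t * to uint16_t * (in this NaCl config everything still binds to the _c versions, since that target carries no SIMD). Before this update, 16-bit buffers were passed through uint8_t * parameters as shifted pointers and every implementation had to undo the trick; the real macros live in vpx_dsp/vpx_dsp_common.h, and the definitions below are a sketch of the idea, not a verbatim copy:

    #include <stdint.h>

    /* A uint16_t buffer is carried in a uint8_t * with its address
     * halved, so byte-based stride math still lands on whole samples. */
    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

    /* Old-style signature: 16-bit data hides behind uint8_t *. */
    static void idct_add_old(uint8_t *dest8, int stride) {
      uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); /* recover real pointer */
      (void)dest; (void)stride;
    }

    /* New-style signature after this update: the cast disappears and the
     * prototype documents the element type. */
    static void idct_add_new(uint16_t *dest, int stride) {
      (void)dest; (void)stride;
    }

    int main(void) {
      uint16_t buf[4] = { 0 };
      idct_add_new(buf, 4);
      idct_add_old(CONVERT_TO_BYTEPTR(buf), 4); /* round-trips the pointer */
      return 0;
    }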
*input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); diff --git a/chromium/third_party/libvpx/source/config/vpx_version.h b/chromium/third_party/libvpx/source/config/vpx_version.h index ebb12f2b240..e8dde9d3ea8 100644 --- a/chromium/third_party/libvpx/source/config/vpx_version.h +++ b/chromium/third_party/libvpx/source/config/vpx_version.h @@ -1,7 +1,7 @@ #define VERSION_MAJOR 1 #define VERSION_MINOR 6 #define VERSION_PATCH 1 -#define VERSION_EXTRA "446-gf22b828d6" +#define VERSION_EXTRA "657-gb3bf91bdc" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1-446-gf22b828d6" -#define VERSION_STRING " v1.6.1-446-gf22b828d6" +#define VERSION_STRING_NOSP "v1.6.1-657-gb3bf91bdc" +#define VERSION_STRING " v1.6.1-657-gb3bf91bdc" diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h index 6b04a45895d..c178d191672 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h @@ -14,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" struct macroblockd; @@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c -void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in void vp9_highbd_quantize_fp_32x32_c(const tran_low_t 
*coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c -void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); #define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); @@ -141,13 +142,13 @@ RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coef void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, 
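vp9_scale_and_extend_frame gains INTERP_FILTER filter_type and int phase_scaler arguments, which is why these headers now #include vp9/common/vp9_filter.h, the home of the INTERP_FILTER enum (EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP, BILINEAR). filter_type selects the subpel filter family used while resampling; phase_scaler offsets the subpel position of the first output sample. A small sketch of the fixed-point source-position arithmetic a scaler of this shape performs, with all constants chosen for illustration:

    #include <stdio.h>

    /* vp9 uses 16 subpel phases (SUBPEL_BITS == 4 in vp9_filter.h). */
    enum { SUBPEL_BITS = 4, SUBPEL_MASK = (1 << SUBPEL_BITS) - 1 };

    int main(void) {
      const int src_w = 64, dst_w = 32;                /* 2:1 downscale */
      const int step = (src_w << SUBPEL_BITS) / dst_w; /* 1/16-pel step */
      const int phase_scaler = 8;                      /* hypothetical half-pel phase */
      for (int x = 0; x < 4; ++x) {
        const int pos = x * step + phase_scaler;
        printf("dst %d <- src %d + %d/16 pel\n", x, pos >> SUBPEL_BITS,
               pos & SUBPEL_MASK);
      }
      return 0;
    }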
int filter_weight, uint32_t *accumulator, uint16_t *count); void vp9_rtcd(void); @@ -198,7 +199,7 @@ static void setup_rtcd_internal(void) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; vp9_temporal_filter_apply = vp9_temporal_filter_apply_c; - if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2; + if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1; } #endif diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h index 2ebbf6e3fa3..49450cda3db 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h @@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p); void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -824,31 +825,39 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); #define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c -void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8 vpx_highbd_convolve8_c +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_avx2(const 
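Two related changes run through the temporal-filter hunks: the accumulator parameter is spelled uint32_t instead of unsigned int (the same width on these targets, but now explicit), and the x86 dispatch moves from SSE2 to SSE4.1, gated by HAS_SSE4_1. A simplified scalar sketch of the accumulation the C reference performs, ignoring strides and block edges:

    #include <stdint.h>
    #include <stdio.h>

    /* A per-pixel weight in 0..16 is derived from the squared difference,
     * scaled by filter_weight, then folded into running sums.  The
     * products can exceed 16 bits, hence the 32-bit accumulator. */
    static void temporal_filter_sketch(const uint8_t *orig, const uint8_t *pred,
                                       int n, int strength, int filter_weight,
                                       uint32_t *accumulator, uint16_t *count) {
      const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
      for (int i = 0; i < n; ++i) {
        const int diff = orig[i] - pred[i];
        int mod = (diff * diff * 3 + rounding) >> strength;
        if (mod > 16) mod = 16;
        mod = (16 - mod) * filter_weight; /* closer match, larger weight */
        count[i] += (uint16_t)mod;
        accumulator[i] += (uint32_t)(mod * pred[i]);
      }
    }

    int main(void) {
      const uint8_t orig[2] = { 100, 100 }, pred[2] = { 101, 180 };
      uint32_t acc[2] = { 0 };
      uint16_t cnt[2] = { 0 };
      temporal_filter_sketch(orig, pred, 2, 15, 2, acc, cnt);
      printf("acc1=%u cnt1=%d\n", (unsigned)acc[1], (int)cnt[1]);
      return 0;
    }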
uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define 
vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int 
y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c @@ -1015,56 +1024,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const 
tran_low_t *input, uint8_t *dest, int stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd); -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd); -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void 
vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -2030,6 +2039,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2; vpx_avg_8x8 = vpx_avg_8x8_c; if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; vpx_convolve8 = vpx_convolve8_c; if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2; if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; @@ -2360,10 +2371,24 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_highbd_8_variance8x16 = vpx_highbd_8_variance8x16_sse2; vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_c; if (flags & HAS_SSE2) vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_sse2; + vpx_highbd_convolve8 = vpx_highbd_convolve8_c; + if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2; + vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_c; + if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2; + vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_c; + if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2; + vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_c; + if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2; + vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_c; + if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2; + vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_c; + if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2; vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_c; if (flags & HAS_SSE2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2; vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_c; if (flags & HAS_SSE2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2; vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_c; if (flags & HAS_SSE2) vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_sse2; vpx_highbd_dc_predictor_32x32 = vpx_highbd_dc_predictor_32x32_c; diff --git a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h index 58a2d4e7268..56d5840ce95 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h @@ -14,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" struct macroblockd; @@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c -void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t 
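Comparing the win/ia32 and win/x64 (and mac/x64) variants of the same header shows the generator's per-target policy: on 64-bit x86, SSE2 is guaranteed by the ABI, so a function with no stronger variant is bound with a plain #define (no pointer, no probe), while the ia32 header emits an RTCD_EXTERN pointer plus a HAS_SSE2 check for the very same function. A compilable sketch of the two shapes; my_fn is hypothetical and only the predefined compiler macros are assumed:

    #include <stdint.h>
    #include <stdio.h>

    static void my_fn_c(uint8_t *dst, int n)    { for (int i = 0; i < n; ++i) dst[i] = 0; }
    static void my_fn_sse2(uint8_t *dst, int n) { my_fn_c(dst, n); /* stand-in */ }

    #if defined(__x86_64__) || defined(_M_X64)
    /* x86-64: SSE2 is part of the ABI, so bind at compile time. */
    #define my_fn my_fn_sse2
    #else
    /* ia32: SSE2 must be probed, so route through a run-time pointer. */
    static void (*my_fn)(uint8_t *, int) = my_fn_c;
    #endif

    int main(void) {
      uint8_t buf[4] = { 1, 2, 3, 4 };
      my_fn(buf, 4);
      printf("%d\n", buf[0]);
      return 0;
    }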
*input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c -void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); #define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); @@ -143,13 +144,13 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t 
*count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2 +void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); void vp9_rtcd(void); @@ -176,6 +177,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; + vp9_temporal_filter_apply = vp9_temporal_filter_apply_c; + if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1; } #endif diff --git a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h index 889fca7c45a..b2403c36bc4 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h @@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); #define vpx_avg_8x8 vpx_avg_8x8_sse2 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_sse2 void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -825,37 +826,45 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); #define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c -void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8 vpx_highbd_convolve8_sse2 - -void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t 
*filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_sse2 - -void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_sse2 - -void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_sse2 - -void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_sse2 - -void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_sse2 - -void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2 - -void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); -#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2 +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int 
y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, 
int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); +RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps); void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c @@ -1022,56 +1031,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t 
*dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 -void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_256_add_sse2 -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 -void vpx_highbd_idct4x4_1_add_c(const 
tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -2082,6 +2091,22 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3; + vpx_highbd_convolve8 = vpx_highbd_convolve8_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2; + vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2; + vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2; + vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2; + vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2; + vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2; + vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2; + vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2; + if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2; vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3; 
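The long runs of assignments in this hunk are libvpx's run-time CPU detection (RTCD) at work: every specialized function has a portable C baseline, RTCD_EXTERN declares a global function pointer for it, and setup_rtcd_internal() probes the CPU feature flags once at startup, re-pointing each symbol at the best kernel the host supports. Below is a minimal, self-contained sketch of the same pattern; the HAS_* values, the scale_* kernels, and setup_rtcd() are illustrative stand-ins, not the real generated symbols.

#include <stdio.h>

#define HAS_SSE2 0x01
#define HAS_AVX2 0x02

static void scale_c(int *v, int n) {            /* portable baseline */
  int i;
  for (i = 0; i < n; ++i) v[i] *= 2;
}
static void scale_sse2(int *v, int n) { scale_c(v, n); } /* SIMD stand-in */
static void scale_avx2(int *v, int n) { scale_c(v, n); } /* SIMD stand-in */

/* The RTCD_EXTERN-style pointer that all callers go through. */
static void (*scale)(int *v, int n);

static void setup_rtcd(int flags) {
  scale = scale_c;                          /* always-safe default first */
  if (flags & HAS_SSE2) scale = scale_sse2; /* each supported extension   */
  if (flags & HAS_AVX2) scale = scale_avx2; /* overwrites the last choice */
}

int main(void) {
  int v[4] = { 1, 2, 3, 4 };
  setup_rtcd(HAS_SSE2);   /* pretend the host supports SSE2 only */
  scale(v, 4);
  printf("%d %d %d %d\n", v[0], v[1], v[2], v[3]);
  return 0;
}

Ordering matters here: flags are tested from weakest to strongest ISA, so the strongest supported extension wins — which is why the new AVX2 checks in the hunk above consistently appear after the SSE2 assignments.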
vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2; diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile index 0d29609ff8c..90522e5f63a 100644 --- a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile +++ b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile @@ -141,8 +141,8 @@ $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 # POWER -$(BUILD_PFX)%_vsx.c.d: CFLAGS += -mvsx -$(BUILD_PFX)%_vsx.c.o: CFLAGS += -mvsx +$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx +$(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx $(BUILD_PFX)%.c.d: %.c $(if $(quiet),@echo " [DEP] $@") diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh index dcfdfe1d2ba..fbe8b1b4580 100644 --- a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh +++ b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh @@ -674,7 +674,6 @@ check_xcode_minimum_version() { process_common_toolchain() { if [ -z "$toolchain" ]; then gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}" - # detect tgt_isa case "$gcctarget" in aarch64*) @@ -697,6 +696,9 @@ process_common_toolchain() { *sparc*) tgt_isa=sparc ;; + power*64*-*) + tgt_isa=ppc64 + ;; power*) tgt_isa=ppc ;; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c index 64d177581ed..b571d29d9a4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c @@ -728,6 +728,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) { SPEED_FEATURES *sf = &cpi->sf; int Mode = cpi->compressor_speed; int Speed = cpi->Speed; + int Speed2; int i; VP8_COMMON *cm = &cpi->common; int last_improved_quant = sf->improved_quant; @@ -829,9 +830,16 @@ void vp8_set_speed_features(VP8_COMP *cpi) { cpi->mode_check_freq[THR_V_PRED] = cpi->mode_check_freq[THR_H_PRED] = cpi->mode_check_freq[THR_B_PRED] = speed_map(Speed, mode_check_freq_map_vhbpred); - cpi->mode_check_freq[THR_NEW1] = speed_map(Speed, mode_check_freq_map_new1); + + // For real-time mode at speed 10 keep the mode_check_freq threshold + // for NEW1 similar to that of speed 9. 
+ Speed2 = Speed; + if (cpi->Speed == 10 && Mode == 2) Speed2 = RT(9); + cpi->mode_check_freq[THR_NEW1] = speed_map(Speed2, mode_check_freq_map_new1); + cpi->mode_check_freq[THR_NEW2] = cpi->mode_check_freq[THR_NEW3] = speed_map(Speed, mode_check_freq_map_new2); + cpi->mode_check_freq[THR_SPLIT1] = speed_map(Speed, mode_check_freq_map_split1); cpi->mode_check_freq[THR_SPLIT2] = cpi->mode_check_freq[THR_SPLIT3] = diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c index 55957414cde..69069042cc2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c @@ -205,7 +205,7 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { const highbd_transform_2d IHT_4[] = { { vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0 @@ -213,7 +213,6 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, { vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2 { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3 }; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int i, j; tran_low_t out[4 * 4]; @@ -245,14 +244,13 @@ static const highbd_transform_2d HIGH_IHT_8[] = { { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3 }; -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; const highbd_transform_2d ht = HIGH_IHT_8[tx_type]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Inverse transform row vectors. 
for (i = 0; i < 8; ++i) { @@ -279,14 +277,13 @@ static const highbd_transform_2d HIGH_IHT_16[] = { { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3 }; -void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; const highbd_transform_2d ht = HIGH_IHT_16[tx_type]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 16; ++i) { @@ -307,7 +304,7 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } // idct -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_idct4x4_16_add(input, dest, stride, bd); @@ -315,7 +312,7 @@ void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_idct4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_iwht4x4_16_add(input, dest, stride, bd); @@ -323,7 +320,7 @@ void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_iwht4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. @@ -340,7 +337,7 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, } } -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // The calculation can be simplified if there are not many non-zero dct // coefficients. Use eobs to separate different cases. 
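The comment this hunk preserves — "the calculation can be simplified if there are not many non-zero dct coefficients" — is the whole point of these wrapper functions: eob is the index of the last nonzero dequantized coefficient in scan order, so a small eob guarantees the nonzeros sit in the top-left corner of the coefficient block and a cheaper partial inverse transform suffices. A sketch of the 16x16 wrapper's shape, written out from the surrounding context rather than copied verbatim; it uses the real vpx_highbd_idct16x16_*_add kernel names (the _38 variant is the one the header changes above alias to the SSE2 256 kernel), assumes a CONFIG_VP9_HIGHBITDEPTH build, and the 1/10/38 cutoffs follow the variant names.

#include "./vpx_dsp_rtcd.h"   /* kernel prototypes, tran_low_t */
#include "vpx/vpx_integer.h"

void highbd_idct16x16_add_sketch(const tran_low_t *input, uint16_t *dest,
                                 int stride, int eob, int bd) {
  if (eob == 1) {
    /* DC-only block: one coefficient, one add per pixel. */
    vpx_highbd_idct16x16_1_add(input, dest, stride, bd);
  } else if (eob <= 10) {
    /* Nonzeros confined to the top-left 4x4 of the coefficient block. */
    vpx_highbd_idct16x16_10_add(input, dest, stride, bd);
  } else if (eob <= 38) {
    /* Nonzeros confined to the top-left 8x8. */
    vpx_highbd_idct16x16_38_add(input, dest, stride, bd);
  } else {
    /* General case: full 256-coefficient inverse transform. */
    vpx_highbd_idct16x16_256_add(input, dest, stride, bd);
  }
}

Since most inter blocks quantize to a handful of low-frequency coefficients, the partial kernels handle the common cases at a fraction of the cost of the full transform.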
@@ -356,7 +353,7 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, } } -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // Non-zero coeff only in upper-left 8x8 if (eob == 1) { @@ -372,7 +369,7 @@ void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, // iht void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) vp9_highbd_idct4x4_add(input, dest, stride, eob, bd); else @@ -380,7 +377,7 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct8x8_add(input, dest, stride, eob, bd); } else { @@ -389,7 +386,7 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct16x16_add(input, dest, stride, eob, bd); } else { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h index ea958a38c0e..3e83b8402de 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h @@ -57,22 +57,22 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); #endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c index 8eb71268986..a108a65153b 100644 --- 
a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c @@ -21,7 +21,7 @@ #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd) { @@ -190,7 +190,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + highbd_inter_predictor(CONVERT_TO_SHORTPTR(pre), pre_buf->stride, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h index 4fed4f7f6ec..1b09b380d41 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h @@ -33,7 +33,7 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride, #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const int subpel_x, const int subpel_y, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( @@ -68,7 +68,7 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl index 10c779c01d3..baf63e97fa9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -7,6 +7,7 @@ print <<EOF #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" struct macroblockd; @@ -101,11 +102,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. 
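The uint8_t-to-uint16_t churn in the prototypes that follows (and in the vp9_idct.c and vp9_reconinter.c hunks above) moves the pointer conversion out of each kernel and up to the call site. High-bitdepth frames store uint16_t samples but travel through code shared with the 8-bit path as tagged uint8_t pointers; CONVERT_TO_SHORTPTR and CONVERT_TO_BYTEPTR are the real vpx_dsp_common.h macros that apply and remove the tag. A self-contained sketch of the convention — everything except those two macros is illustrative:

#include <stdint.h>
#include <stdio.h>

/* These two definitions match vpx_dsp/vpx_dsp_common.h: a high-bitdepth
   buffer is carried as a uint8_t * whose address is the real uint16_t
   address shifted right by one, so each conversion is a single shift
   (lossless because uint16_t storage is 2-byte aligned). */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

/* Illustrative kernel: takes uint16_t * directly, as after this change. */
static void fill_kernel(uint16_t *dst, int n, uint16_t v) {
  int i;
  for (i = 0; i < n; ++i) dst[i] = v;
}

int main(void) {
  uint16_t buf[8];
  /* Shared 8-bit/high-bitdepth code sees only the tagged byte pointer... */
  uint8_t *tagged = CONVERT_TO_BYTEPTR(buf);
  /* ...and the call site untags it exactly once, instead of every kernel
     internally doing uint16_t *dest = CONVERT_TO_SHORTPTR(dest8). */
  fill_kernel(CONVERT_TO_SHORTPTR(tagged), 8, 1023);
  printf("%u\n", (unsigned)buf[0]);
  return 0;
}

Pushing the conversion to the boundary also lets the SIMD kernels receive properly typed uint16_t pointers, which is exactly what the revised add_proto lines below declare.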
- add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; } # @@ -120,7 +121,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { # if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { add_proto qw/int vp9_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude"; - specialize qw/vp9_denoiser_filter sse2/; + specialize qw/vp9_denoiser_filter neon sse2/; } if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -197,8 +198,8 @@ $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8; add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx/; -add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; -specialize qw/vp9_temporal_filter_apply sse2 msa/; +add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; +specialize qw/vp9_temporal_filter_apply sse4_1/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -217,7 +218,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; + add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; } # End vp9_high encoder functions @@ -225,7 +226,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # frame based scale # -add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst"; +add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler"; specialize qw/vp9_scale_and_extend_frame ssse3/; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c 
b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index dcfc454aa0d..bb2dcf52bf5 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -16,7 +16,6 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[2]; - const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); in[0] = load_input_data(input); @@ -49,31 +48,7 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, in[0]); - d2 = _mm_add_epi16(d2, in[1]); - d0 = _mm_packus_epi16(d0, d2); - // store result[0] - *(int *)dest = _mm_cvtsi128_si32(d0); - // store result[1] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store result[2] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - // store result[3] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - } + recon_and_store4x4_sse2(in, dest, stride); } void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c index f71f7d1eb41..0760f8c2398 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c @@ -189,21 +189,22 @@ static void inverse_transform_block_inter(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -256,21 +257,22 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - 
vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_8X8: - vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_16X16: - vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -451,24 +453,19 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, const struct scale_factors *sf, MACROBLOCKD *xd, int w, int h, int ref, int xs, int ys) { DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]); - const uint8_t *buf_ptr; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w, x0, y0, b_w, b_h, frame_width, frame_height); - buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset; + highbd_inter_predictor(mc_buf_high + border_offset, b_w, + CONVERT_TO_SHORTPTR(dst), dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w, x0, y0, b_w, b_h, frame_width, frame_height); - buf_ptr = ((uint8_t *)mc_buf_high) + border_offset; - } - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, - subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); - } else { - inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, subpel_y, sf, - w, h, ref, kernel, xs, ys); + inter_predictor(((uint8_t *)mc_buf_high) + border_offset, b_w, dst, + dst_buf_stride, subpel_x, subpel_y, sf, w, h, ref, kernel, + xs, ys); } } #else @@ -631,7 +628,8 @@ static void dec_build_inter_predictors( } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + highbd_inter_predictor(CONVERT_TO_SHORTPTR(buf_ptr), buf_stride, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c new file mode 100644 index 00000000000..4152e7bb5d5 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" +#include "vpx_mem/vpx_mem.h" + +// Compute the sum of all pixel differences of this MB. 
+static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); + const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); + const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210), + vget_low_s64(fedcba98_76543210)); + const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); + return sum_diff; +} + +// Denoise a 16x1 vector. +static INLINE int8x16_t denoiser_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold, + const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment, + const uint8x16_t v_delta_level_1_and_2, + const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) { + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = + vandq_u8(v_level2_mask, v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = + vandq_u8(v_level3_mask, v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = + vaddq_u8(v_level1_adjustment, v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = + vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = + vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. 
+ */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +static INLINE int8x16_t denoiser_adjust_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t k_delta, int8x16_t v_sum_diff_total) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +// Denoise 8x8 and 8x16 blocks. +static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude, + int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 
4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_height = (4 << b_height_log2_lookup[bs]) >> 1; + + int8x16_t v_sum_diff_total = vdupq_n_s8(0); + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + v_sum_diff_total = denoiser_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], + v_level1_threshold, v_level2_threshold, v_level3_threshold, + v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + v_sum_diff_total = denoiser_adjust_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = + vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = + vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + return FILTER_BLOCK; +} + +// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks. 
+static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_width = (4 << b_width_log2_lookup[bs]); + const int b_height = (4 << b_height_log2_lookup[bs]); + const int b_width_shift4 = b_width >> 4; + + int8x16_t v_sum_diff_total[4][4]; + int r, c, sum_diff = 0; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r] = vdupq_n_s8(0); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon( + sig, mc_running_avg_y, running_avg_y, v_level1_threshold, + v_level2_threshold, v_level3_threshold, v_level1_adjustment, + v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vdupq_n_u8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = + denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y, + k_delta, v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. 
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } + return COPY_BLOCK; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c index 98c56407596..0b175969be6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -22,6 +22,7 @@ #include "vp9/encoder/vp9_rd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/vpx_dsp_common.h" void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c deleted file mode 100644 index 1ab5f36cc59..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vp9_rtcd.h" -#include "vpx_dsp/mips/macros_msa.h" - -static void temporal_filter_apply_8size_msa(const uint8_t *frm1_ptr, - uint32_t stride, - const uint8_t *frm2_ptr, - int32_t filt_sth, int32_t filt_wgt, - uint32_t *acc, uint16_t *cnt) { - uint32_t row; - uint64_t f0, f1, f2, f3; - v16i8 frm2, frm1 = { 0 }; - v16i8 frm4, frm3 = { 0 }; - v16u8 frm_r, frm_l; - v8i16 frm2_r, frm2_l; - v8i16 diff0, diff1, mod0_h, mod1_h; - v4i32 cnst3, cnst16, filt_wt, strength; - v4i32 mod0_w, mod1_w, mod2_w, mod3_w; - v4i32 diff0_r, diff0_l, diff1_r, diff1_l; - v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; - v4i32 acc0, acc1, acc2, acc3; - v8i16 cnt0, cnt1; - - filt_wt = __msa_fill_w(filt_wgt); - strength = __msa_fill_w(filt_sth); - cnst3 = __msa_ldi_w(3); - cnst16 = __msa_ldi_w(16); - - for (row = 2; row--;) { - LD4(frm1_ptr, stride, f0, f1, f2, f3); - frm1_ptr += (4 * stride); - - LD_SB2(frm2_ptr, 16, frm2, frm4); - frm2_ptr += 32; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - INSERT_D2_SB(f0, f1, frm1); - INSERT_D2_SB(f2, f3, frm3); - ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - UNPCK_UB_SH(frm2, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - 
mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - UNPCK_UB_SH(frm4, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - } -} - -static void temporal_filter_apply_16size_msa(const uint8_t *frm1_ptr, - uint32_t stride, - const uint8_t *frm2_ptr, - int32_t filt_sth, int32_t filt_wgt, - uint32_t *acc, uint16_t *cnt) { - uint32_t row; - v16i8 frm1, frm2, frm3, frm4; - v16u8 frm_r, frm_l; - v16i8 zero = { 0 }; - v8u16 frm2_r, frm2_l; - v8i16 diff0, diff1, mod0_h, mod1_h; - v4i32 cnst3, cnst16, filt_wt, strength; - v4i32 mod0_w, mod1_w, mod2_w, mod3_w; - v4i32 diff0_r, diff0_l, diff1_r, diff1_l; - v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; - v4i32 acc0, acc1, acc2, acc3; - v8i16 cnt0, cnt1; - - filt_wt = __msa_fill_w(filt_wgt); - strength = __msa_fill_w(filt_sth); - cnst3 = __msa_ldi_w(3); - cnst16 = __msa_ldi_w(16); - - for (row = 8; row--;) { - LD_SB2(frm1_ptr, stride, frm1, frm3); - frm1_ptr += stride; - - LD_SB2(frm2_ptr, 16, frm2, frm4); - frm2_ptr += 16; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, 
mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - frm1_ptr += stride; - frm2_ptr += 16; - } -} - -void vp9_temporal_filter_apply_msa(const uint8_t *frame1_ptr, uint32_t stride, - const uint8_t *frame2_ptr, uint32_t blk_w, - uint32_t blk_h, int32_t strength, - int32_t filt_wgt, uint32_t *accu, - uint16_t *cnt) { - if (8 == (blk_w * blk_h)) { - temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength, - filt_wgt, accu, cnt); - } else if (16 == (blk_w * blk_h)) { - temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength, - filt_wgt, accu, cnt); - } else { - vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, - strength, filt_wgt, accu, cnt); - } -} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index b4a0bbe58bd..048ea629f5a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -277,8 +277,6 @@ void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) { !cpi->oxcf.gf_cbr_boost_pct) { // Force this frame as a golden update frame if this frame changes the // resolution (resize_pending != 0). - // TODO(marpan): check on forcing golden update if the background has very - // high motion in current frame. 
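/* [Editor's sketch -- annotation, not part of the upstream diff.] Scalar
 * equivalent of the per-pixel math in the MSA temporal filter deleted above
 * (temporal_filter_apply_8size/16size_msa): the squared difference between
 * the anchor and the filtered frame becomes a 0..16 weight (SRAR_W4_SW is a
 * rounded arithmetic shift; the rounding term below assumes the usual C
 * reference behavior), which is scaled by the filter weight and folded into
 * the count/accumulator pair. */
#include <stdint.h>

static void temporal_filter_apply_sketch(const uint8_t *frm1, int stride,
                                         const uint8_t *frm2, int w, int h,
                                         int strength, int filt_wgt,
                                         uint32_t *acc, uint16_t *cnt) {
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
  int i, j, k = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j, ++k) {
      const int diff = frm1[i * stride + j] - frm2[k];
      int mod = (diff * diff * 3 + rounding) >> strength;
      /* (mod < 16) mask, then 16 - mod: weight decays to 0 for large diffs. */
      mod = mod < 16 ? 16 - mod : 0;
      mod *= filt_wgt;
      cnt[k] = (uint16_t)(cnt[k] + mod);
      acc[k] += (uint32_t)mod * frm2[k];
    }
  }
}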
if (cpi->resize_pending != 0) { vp9_cyclic_refresh_set_golden_update(cpi); rc->frames_till_gf_update_due = rc->baseline_gf_interval; @@ -316,6 +314,8 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) { else rc->baseline_gf_interval = 40; if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20; + if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40) + rc->baseline_gf_interval = 10; } // Update the segmentation map, and related quantities: cyclic refresh map, @@ -425,6 +425,13 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { int target_refresh = 0; double weight_segment_target = 0; double weight_segment = 0; + cr->apply_cyclic_refresh = 1; + if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || + (!cpi->use_svc && rc->avg_frame_low_motion < 55 && + rc->frames_since_key > 40)) { + cr->apply_cyclic_refresh = 0; + return; + } cr->percent_refresh = 10; if (cr->reduce_refresh) cr->percent_refresh = 5; cr->max_qdelta_perc = 60; @@ -493,14 +500,8 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; struct segmentation *const seg = &cm->seg; - // TODO(marpan): Look into whether we should reduce the amount/delta-qp - // instead of completely shutting off at low bitrates. For now keep it on. - // const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc); - const int apply_cyclic_refresh = 1; if (cm->current_video_frame == 0) cr->low_content_avg = 0.0; - // Don't apply refresh on key frame or temporal enhancement layer frames. - if (!apply_cyclic_refresh || (cm->frame_type == KEY_FRAME) || - (cpi->force_update_segmentation) || (cpi->svc.temporal_layer_id > 0)) { + if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation)) { // Set segmentation map to 0 and disable. unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h index 9de5074d9ec..77fa67c9e16 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -67,6 +67,7 @@ struct CYCLIC_REFRESH { int qindex_delta[3]; int reduce_refresh; double weight_segment; + int apply_cyclic_refresh; }; struct VP9_COMP; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h index 42dc6830d6c..ab488f48f0a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h @@ -93,11 +93,6 @@ struct macroblock { int rddiv; int rdmult; int mb_energy; - int *m_search_count_ptr; - int *ex_search_count_ptr; -#if CONFIG_MULTITHREAD - pthread_mutex_t *search_count_mutex; -#endif // These are set to their default values at the beginning, and then adjusted // further in the encoding process. 
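// [Editor's note -- annotation, not part of the upstream diff.] The
// vp9_aq_cyclicrefresh.c hunks above move the on/off decision into the new
// cr->apply_cyclic_refresh flag, computed once per frame. A condensed model
// of that gate (names mirror the diff; the rationale comment is editorial):
static int apply_cyclic_refresh_sketch(int is_key_frame, int temporal_layer_id,
                                       int use_svc, int avg_frame_low_motion,
                                       int frames_since_key) {
  // Skip refresh on key frames and temporal enhancement layers, and -- for
  // non-SVC -- when the frame has been mostly high-motion for a while, where
  // boosted segments would presumably be re-encoded before paying off.
  if (is_key_frame || temporal_layer_id > 0) return 0;
  if (!use_svc && avg_frame_low_motion < 55 && frames_since_key > 40) return 0;
  return 1;
}
// The same low-motion statistic also shortens the golden-frame interval to
// 10 in vp9_cyclic_refresh_set_golden_update() (threshold 50 there).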
@@ -173,6 +168,8 @@ struct macroblock { uint8_t skip_low_source_sad; + uint8_t lowvar_highsumdiff; + uint8_t last_sb_high_content; // For each superblock: saves the content value (e.g., low/high sad/sumdiff) @@ -187,7 +184,7 @@ struct macroblock { void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH - void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, + void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); #endif }; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c index b92557a9c40..e6933f00d8b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c @@ -191,7 +191,9 @@ static VP9_DENOISER_DECISION perform_motion_compensation( int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv, int num_spatial_layers, int width) { - int sse_diff = ctx->zeromv_sse - ctx->newmv_sse; + const int sse_diff = (ctx->newmv_sse == UINT_MAX) + ? 0 + : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); MV_REFERENCE_FRAME frame; MACROBLOCKD *filter_mbd = &mb->e_mbd; MODE_INFO *mi = filter_mbd->mi[0]; @@ -217,7 +219,6 @@ static VP9_DENOISER_DECISION perform_motion_compensation( // difference in sum-squared-error, use it. if (frame != INTRA_FRAME && (frame != GOLDEN_FRAME || num_spatial_layers == 1) && - ctx->newmv_sse != UINT_MAX && sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { mi->ref_frame[0] = ctx->best_reference_frame; mi->mode = ctx->best_sse_inter_mode; @@ -571,20 +572,26 @@ void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) { // Scale/increase the partition threshold for denoiser speed-up. int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, - int content_state) { + int content_state, int temporal_layer_id) { if ((content_state == kLowSadLowSumdiff) || - (content_state == kHighSadLowSumdiff) || noise_level == kDenHigh) - return (3 * threshold) >> 1; - else + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff) || (noise_level == kDenHigh) || + (temporal_layer_id != 0)) { + int64_t scaled_thr = + (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2; + return scaled_thr; + } else { return (5 * threshold) >> 2; + } } // Scale/increase the ac skip threshold for denoiser speed-up. int64_t vp9_scale_acskip_thresh(int64_t threshold, - VP9_DENOISER_LEVEL noise_level, - int abs_sumdiff) { + VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id) { if (noise_level >= kDenLow && abs_sumdiff < 5) - return threshold *= (noise_level == kDenLow) ? 2 : 6; + return threshold *= + (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 
10 : 6; else return threshold; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h index 9bded21769d..f0845e113c0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h @@ -95,11 +95,11 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser); void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level); int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, - int content_state); + int content_state, int temporal_layer_id); int64_t vp9_scale_acskip_thresh(int64_t threshold, - VP9_DENOISER_LEVEL noise_level, - int abs_sumdiff); + VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id); #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c index 481f5a0fdac..6215e198ca6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c @@ -495,11 +495,13 @@ int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, int width, if (width <= 640 && height <= 480) return (5 * threshold_base) >> 2; else if ((content_state == kLowSadLowSumdiff) || - (content_state == kHighSadLowSumdiff)) + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) return (5 * threshold_base) >> 2; } else if (speed == 7) { if ((content_state == kLowSadLowSumdiff) || - (content_state == kHighSadLowSumdiff)) { + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) { return (5 * threshold_base) >> 2; } } @@ -536,10 +538,11 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, threshold_base = (7 * threshold_base) >> 3; } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && cpi->oxcf.speed > 5 && - cpi->denoiser.denoising_level >= kDenLow) - threshold_base = vp9_scale_part_thresh( - threshold_base, cpi->denoiser.denoising_level, content_state); + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow) + threshold_base = + vp9_scale_part_thresh(threshold_base, cpi->denoiser.denoising_level, + content_state, cpi->svc.temporal_layer_id); else threshold_base = scale_part_thresh_sumdiff(threshold_base, cpi->oxcf.speed, cm->width, @@ -838,7 +841,8 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, } } -static void copy_partitioning_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, +static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, + MACROBLOCKD *xd, BLOCK_SIZE bsize, int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; BLOCK_SIZE *prev_part = cpi->prev_partition; @@ -848,49 +852,61 @@ static void copy_partitioning_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, const int bs = (1 << bsl) / 4; BLOCK_SIZE subsize; PARTITION_TYPE partition; - MODE_INFO *mi = NULL; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; partition = partition_lookup[bsl][prev_part[start_pos]]; subsize = get_subsize(bsize, partition); - mi = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; if (subsize < BLOCK_8X8) { - mi->sb_type = bsize; + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); } else { switch (partition) { - case 
PARTITION_NONE: mi->sb_type = bsize; break; + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + break; case PARTITION_HORZ: - mi->sb_type = subsize; - if (mi_row + bs < cm->mi_rows) - cm->mi_grid_visible[(mi_row + bs) * cm->mi_stride + mi_col]->sb_type = - subsize; + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + bs, mi_col, subsize); break; case PARTITION_VERT: - mi->sb_type = subsize; - if (mi_col + bs < cm->mi_cols) - cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col + bs]->sb_type = - subsize; + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize); break; case PARTITION_SPLIT: - copy_partitioning_helper(cpi, subsize, mi_row, mi_col); - copy_partitioning_helper(cpi, subsize, mi_row + bs, mi_col); - copy_partitioning_helper(cpi, subsize, mi_row, mi_col + bs); - copy_partitioning_helper(cpi, subsize, mi_row + bs, mi_col + bs); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs); break; default: assert(0); } } } -static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, - int mi_col, int segment_id, int sb_offset) { - if (cpi->rc.frames_since_key > 1 && segment_id == CR_SEGMENT_ID_BASE && +static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + int mi_row, int mi_col, int segment_id, + int sb_offset) { + int svc_copy_allowed = 1; + int frames_since_key_thresh = 1; + if (cpi->use_svc) { + // For SVC, don't allow copy if base spatial layer is key frame, or if + // frame is not a temporal enhancement layer frame. + int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->is_key_frame || + (cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1 && + cpi->svc.number_temporal_layers > 1)) + svc_copy_allowed = 0; + frames_since_key_thresh = cpi->svc.number_spatial_layers << 1; + } + if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed && + !cpi->resize_pending && segment_id == CR_SEGMENT_ID_BASE && cpi->prev_segment_id[sb_offset] == CR_SEGMENT_ID_BASE && cpi->copied_frame_cnt[sb_offset] < cpi->max_copied_frame) { if (cpi->prev_partition != NULL) { - copy_partitioning_helper(cpi, BLOCK_64X64, mi_row, mi_col); + copy_partitioning_helper(cpi, x, xd, BLOCK_64X64, mi_row, mi_col); cpi->copied_frame_cnt[sb_offset] += 1; memcpy(x->variance_low, &(cpi->prev_variance_low[sb_offset * 25]), sizeof(x->variance_low)); @@ -946,9 +962,16 @@ static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, unsigned int y_sad, int is_key_frame) { int i; MACROBLOCKD *xd = &x->e_mbd; + + if (is_key_frame) return; + // For speed >= 8, avoid the chroma check if y_sad is above threshold. - if (is_key_frame || (cpi->oxcf.speed >= 8 && y_sad > cpi->vbp_thresholds[1])) - return; + if (cpi->oxcf.speed >= 8) { + if (y_sad > cpi->vbp_thresholds[1] && + (!cpi->noise_estimate.enabled || + vp9_noise_estimate_extract_level(&cpi->noise_estimate) < kMedium)) + return; + } for (i = 1; i <= 2; ++i) { unsigned int uv_sad = UINT_MAX; @@ -994,6 +1017,11 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, else x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? 
kHighSadLowSumdiff : kHighSadHighSumdiff; + + // Detect large lighting change. + if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000) + x->content_state_sb = kLowVarHighSumdiff; + if (cpi->content_state_sb_fd != NULL) { if (tmp_sad < avg_source_sad_threshold2) { // Cap the increment to 255. @@ -1061,11 +1089,12 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, content_state == kLowSadHighSumdiff) ? 1 : 0; + x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0; if (cpi->content_state_sb_fd != NULL) x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2]; // If source_sad is low copy the partition without computing the y_sad. if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && - copy_partitioning(cpi, x, mi_row, mi_col, segment_id, sb_offset)) { + copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { return 0; } } @@ -1192,7 +1221,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // Stop the copy every cpi->max_copied_frame to refresh the partition. // TODO(jianj) : tune the threshold. if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy && - copy_partitioning(cpi, x, mi_row, mi_col, segment_id, sb_offset)) { + copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { chroma_check(cpi, x, bsize, y_sad, is_key_frame); return 0; } @@ -4110,6 +4139,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, x->color_sensitivity[1] = 0; x->sb_is_skin = 0; x->skip_low_source_sad = 0; + x->lowvar_highsumdiff = 0; x->content_state_sb = 0; if (seg->enabled) { @@ -4341,8 +4371,6 @@ void vp9_init_tile_data(VP9_COMP *cpi) { } } #if CONFIG_MULTITHREAD - tile_data->search_count_mutex = NULL; - tile_data->enc_row_mt_mutex = NULL; tile_data->row_base_thresh_freq_fact = NULL; #endif } @@ -4361,10 +4389,6 @@ void vp9_init_tile_data(VP9_COMP *cpi) { cpi->tplist[tile_row][tile_col] = tplist + tplist_count; tplist = cpi->tplist[tile_row][tile_col]; tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); - - // Set up pointers to per thread motion search counters. - this_tile->m_search_count = 0; // Count of motion search hits. - this_tile->ex_search_count = 0; // Exhaustive mesh search hits. } } } @@ -4409,13 +4433,6 @@ void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row, const int mi_row_end = tile_info->mi_row_end; int mi_row; - // Set up pointers to per thread motion search counters. 
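// [Editor's note -- annotation, not part of the upstream diff.] On the
// "Detect large lighting change" test added above: libvpx variance helpers
// return sse - (sum * sum) / N, so (tmp_sse - tmp_variance) equals
// N * mean_diff^2. A large value paired with a small residual variance means
// the frame difference is mostly a uniform DC shift (a lighting change)
// rather than motion. Worked example for a 64x64 superblock (N = 4096) with
// a uniform brightness step of +2: sse = 16384 and variance = 0, so
// sse - variance = 16384 > 10000 and variance < (sse >> 3) -- detected.
static int is_lighting_change_sketch(unsigned int sse, unsigned int var) {
  return var < (sse >> 3) && (sse - var) > 10000;
}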
- td->mb.m_search_count_ptr = &this_tile->m_search_count; - td->mb.ex_search_count_ptr = &this_tile->ex_search_count; -#if CONFIG_MULTITHREAD - td->mb.search_count_mutex = this_tile->search_count_mutex; -#endif - for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c index 0940d9a6153..7e30499c573 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c @@ -637,24 +637,25 @@ static void encode_block(int plane, int block, int row, int col, if (x->skip_encode || p->eobs[block] == 0) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_4X4: // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + x->highbd_itxm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; default: assert(0 && "Invalid transform size"); @@ -699,7 +700,8 @@ static void encode_block_pass1(int plane, int block, int row, int col, if (p->eobs[block] > 0) { #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd); + x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, + p->eobs[block], xd->bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -799,6 +801,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: if (!x->skip_recode) { @@ -810,8 +813,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { - vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; case TX_16X16: @@ -827,8 +833,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, pd->dequant, eob, scan_order->scan, scan_order->iscan); } + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { - vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob, + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; @@ -845,8 +854,11 @@ void vp9_encode_block_intra(int 
plane, int block, int row, int col, pd->dequant, eob, scan_order->scan, scan_order->iscan); } + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { - vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob, + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; @@ -863,15 +875,18 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) { // this is like vp9_short_idct4x4 but has a special case around // eob<=1 which is significant (not just an optimization) for the // lossless case. - x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + x->highbd_itxm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } else { - vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd); + vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type, + xd->bd); } } break; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c index 1dc70d2d361..f57f40dbe4c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c @@ -73,6 +73,9 @@ // chosen. // #define OUTPUT_YUV_REC +#define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold +#define FRAME_RATE_FACTOR 8 + #ifdef OUTPUT_YUV_DENOISED FILE *yuv_denoised_file = NULL; #endif @@ -100,6 +103,331 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif +// compute adaptive threshold for skip recoding +static int compute_context_model_thresh(const VP9_COMP *const cpi) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_size = (cm->width * cm->height) >> 10; + const int bitrate = (int)(oxcf->target_bandwidth >> 10); + const int qindex_factor = cm->base_qindex + (MAXQ >> 1); + + // This equation makes the threshold adaptive to frame size. + // Coding gain obtained by recoding comes from alternate frames of large + // content change. We skip recoding if the difference of previous and current + // frame context probability model is less than a certain threshold. + // The first component is the most critical part to guarantee adaptivity. + // Other parameters are estimated based on normal setting of hd resolution + // parameters. e.g frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 + const int thresh = + ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) * + qindex_factor) >> + 9; + + return thresh; +} + +// compute the total cost difference between current +// and previous frame context prob model. 
+static int compute_context_model_diff(const VP9_COMMON *const cm) { + const FRAME_CONTEXT *const pre_fc = + &cm->frame_contexts[cm->frame_context_idx]; + const FRAME_CONTEXT *const cur_fc = cm->fc; + const FRAME_COUNTS *counts = &cm->counts; + vpx_prob pre_last_prob, cur_last_prob; + int diff = 0; + int i, j, k, l, m, n; + + // y_mode_prob + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + for (j = 0; j < INTRA_MODES - 1; ++j) { + diff += (int)counts->y_mode[i][j] * + (pre_fc->y_mode_prob[i][j] - cur_fc->y_mode_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->y_mode_prob[i][INTRA_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->y_mode_prob[i][INTRA_MODES - 2]; + + diff += (int)counts->y_mode[i][INTRA_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // uv_mode_prob + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES - 1; ++j) { + diff += (int)counts->uv_mode[i][j] * + (pre_fc->uv_mode_prob[i][j] - cur_fc->uv_mode_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->uv_mode_prob[i][INTRA_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->uv_mode_prob[i][INTRA_MODES - 2]; + + diff += (int)counts->uv_mode[i][INTRA_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // partition_prob + for (i = 0; i < PARTITION_CONTEXTS; ++i) { + for (j = 0; j < PARTITION_TYPES - 1; ++j) { + diff += (int)counts->partition[i][j] * + (pre_fc->partition_prob[i][j] - cur_fc->partition_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->partition_prob[i][PARTITION_TYPES - 2]; + cur_last_prob = MAX_PROB - cur_fc->partition_prob[i][PARTITION_TYPES - 2]; + + diff += (int)counts->partition[i][PARTITION_TYPES - 1] * + (pre_last_prob - cur_last_prob); + } + + // coef_probs + for (i = 0; i < TX_SIZES; ++i) { + for (j = 0; j < PLANE_TYPES; ++j) { + for (k = 0; k < REF_TYPES; ++k) { + for (l = 0; l < COEF_BANDS; ++l) { + for (m = 0; m < BAND_COEFF_CONTEXTS(l); ++m) { + for (n = 0; n < UNCONSTRAINED_NODES; ++n) { + diff += (int)counts->coef[i][j][k][l][m][n] * + (pre_fc->coef_probs[i][j][k][l][m][n] - + cur_fc->coef_probs[i][j][k][l][m][n]); + } + + pre_last_prob = + MAX_PROB - + pre_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1]; + cur_last_prob = + MAX_PROB - + cur_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1]; + + diff += (int)counts->coef[i][j][k][l][m][UNCONSTRAINED_NODES] * + (pre_last_prob - cur_last_prob); + } + } + } + } + } + + // switchable_interp_prob + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { + for (j = 0; j < SWITCHABLE_FILTERS - 1; ++j) { + diff += (int)counts->switchable_interp[i][j] * + (pre_fc->switchable_interp_prob[i][j] - + cur_fc->switchable_interp_prob[i][j]); + } + pre_last_prob = + MAX_PROB - pre_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2]; + cur_last_prob = + MAX_PROB - cur_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2]; + + diff += (int)counts->switchable_interp[i][SWITCHABLE_FILTERS - 1] * + (pre_last_prob - cur_last_prob); + } + + // inter_mode_probs + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + for (j = 0; j < INTER_MODES - 1; ++j) { + diff += (int)counts->inter_mode[i][j] * + (pre_fc->inter_mode_probs[i][j] - cur_fc->inter_mode_probs[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->inter_mode_probs[i][INTER_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->inter_mode_probs[i][INTER_MODES - 2]; + + diff += (int)counts->inter_mode[i][INTER_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // intra_inter_prob + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + diff += (int)counts->intra_inter[i][0] * + 
(pre_fc->intra_inter_prob[i] - cur_fc->intra_inter_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->intra_inter_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->intra_inter_prob[i]; + + diff += (int)counts->intra_inter[i][1] * (pre_last_prob - cur_last_prob); + } + + // comp_inter_prob + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + diff += (int)counts->comp_inter[i][0] * + (pre_fc->comp_inter_prob[i] - cur_fc->comp_inter_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->comp_inter_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->comp_inter_prob[i]; + + diff += (int)counts->comp_inter[i][1] * (pre_last_prob - cur_last_prob); + } + + // single_ref_prob + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < 2; ++j) { + diff += (int)counts->single_ref[i][j][0] * + (pre_fc->single_ref_prob[i][j] - cur_fc->single_ref_prob[i][j]); + + pre_last_prob = MAX_PROB - pre_fc->single_ref_prob[i][j]; + cur_last_prob = MAX_PROB - cur_fc->single_ref_prob[i][j]; + + diff += + (int)counts->single_ref[i][j][1] * (pre_last_prob - cur_last_prob); + } + } + + // comp_ref_prob + for (i = 0; i < REF_CONTEXTS; ++i) { + diff += (int)counts->comp_ref[i][0] * + (pre_fc->comp_ref_prob[i] - cur_fc->comp_ref_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->comp_ref_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->comp_ref_prob[i]; + + diff += (int)counts->comp_ref[i][1] * (pre_last_prob - cur_last_prob); + } + + // tx_probs + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + // p32x32 + for (j = 0; j < TX_SIZES - 1; ++j) { + diff += (int)counts->tx.p32x32[i][j] * + (pre_fc->tx_probs.p32x32[i][j] - cur_fc->tx_probs.p32x32[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p32x32[i][TX_SIZES - 2]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p32x32[i][TX_SIZES - 2]; + + diff += (int)counts->tx.p32x32[i][TX_SIZES - 1] * + (pre_last_prob - cur_last_prob); + + // p16x16 + for (j = 0; j < TX_SIZES - 2; ++j) { + diff += (int)counts->tx.p16x16[i][j] * + (pre_fc->tx_probs.p16x16[i][j] - cur_fc->tx_probs.p16x16[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p16x16[i][TX_SIZES - 3]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p16x16[i][TX_SIZES - 3]; + + diff += (int)counts->tx.p16x16[i][TX_SIZES - 2] * + (pre_last_prob - cur_last_prob); + + // p8x8 + for (j = 0; j < TX_SIZES - 3; ++j) { + diff += (int)counts->tx.p8x8[i][j] * + (pre_fc->tx_probs.p8x8[i][j] - cur_fc->tx_probs.p8x8[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p8x8[i][TX_SIZES - 4]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p8x8[i][TX_SIZES - 4]; + + diff += + (int)counts->tx.p8x8[i][TX_SIZES - 3] * (pre_last_prob - cur_last_prob); + } + + // skip_probs + for (i = 0; i < SKIP_CONTEXTS; ++i) { + diff += (int)counts->skip[i][0] * + (pre_fc->skip_probs[i] - cur_fc->skip_probs[i]); + + pre_last_prob = MAX_PROB - pre_fc->skip_probs[i]; + cur_last_prob = MAX_PROB - cur_fc->skip_probs[i]; + + diff += (int)counts->skip[i][1] * (pre_last_prob - cur_last_prob); + } + + // mv + for (i = 0; i < MV_JOINTS - 1; ++i) { + diff += (int)counts->mv.joints[i] * + (pre_fc->nmvc.joints[i] - cur_fc->nmvc.joints[i]); + } + pre_last_prob = MAX_PROB - pre_fc->nmvc.joints[MV_JOINTS - 2]; + cur_last_prob = MAX_PROB - cur_fc->nmvc.joints[MV_JOINTS - 2]; + + diff += + (int)counts->mv.joints[MV_JOINTS - 1] * (pre_last_prob - cur_last_prob); + + for (i = 0; i < 2; ++i) { + const nmv_component_counts *nmv_count = &counts->mv.comps[i]; + const nmv_component *pre_nmv_prob = &pre_fc->nmvc.comps[i]; + const nmv_component *cur_nmv_prob = &cur_fc->nmvc.comps[i]; + + // sign + diff += 
(int)nmv_count->sign[0] * (pre_nmv_prob->sign - cur_nmv_prob->sign); + + pre_last_prob = MAX_PROB - pre_nmv_prob->sign; + cur_last_prob = MAX_PROB - cur_nmv_prob->sign; + + diff += (int)nmv_count->sign[1] * (pre_last_prob - cur_last_prob); + + // classes + for (j = 0; j < MV_CLASSES - 1; ++j) { + diff += (int)nmv_count->classes[j] * + (pre_nmv_prob->classes[j] - cur_nmv_prob->classes[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->classes[MV_CLASSES - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->classes[MV_CLASSES - 2]; + + diff += (int)nmv_count->classes[MV_CLASSES - 1] * + (pre_last_prob - cur_last_prob); + + // class0 + for (j = 0; j < CLASS0_SIZE - 1; ++j) { + diff += (int)nmv_count->class0[j] * + (pre_nmv_prob->class0[j] - cur_nmv_prob->class0[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->class0[CLASS0_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0[CLASS0_SIZE - 2]; + + diff += (int)nmv_count->class0[CLASS0_SIZE - 1] * + (pre_last_prob - cur_last_prob); + + // bits + for (j = 0; j < MV_OFFSET_BITS; ++j) { + diff += (int)nmv_count->bits[j][0] * + (pre_nmv_prob->bits[j] - cur_nmv_prob->bits[j]); + + pre_last_prob = MAX_PROB - pre_nmv_prob->bits[j]; + cur_last_prob = MAX_PROB - cur_nmv_prob->bits[j]; + + diff += (int)nmv_count->bits[j][1] * (pre_last_prob - cur_last_prob); + } + + // class0_fp + for (j = 0; j < CLASS0_SIZE; ++j) { + for (k = 0; k < MV_FP_SIZE - 1; ++k) { + diff += (int)nmv_count->class0_fp[j][k] * + (pre_nmv_prob->class0_fp[j][k] - cur_nmv_prob->class0_fp[j][k]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->class0_fp[j][MV_FP_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0_fp[j][MV_FP_SIZE - 2]; + + diff += (int)nmv_count->class0_fp[j][MV_FP_SIZE - 1] * + (pre_last_prob - cur_last_prob); + } + + // fp + for (j = 0; j < MV_FP_SIZE - 1; ++j) { + diff += + (int)nmv_count->fp[j] * (pre_nmv_prob->fp[j] - cur_nmv_prob->fp[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->fp[MV_FP_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->fp[MV_FP_SIZE - 2]; + + diff += + (int)nmv_count->fp[MV_FP_SIZE - 1] * (pre_last_prob - cur_last_prob); + + // class0_hp + diff += (int)nmv_count->class0_hp[0] * + (pre_nmv_prob->class0_hp - cur_nmv_prob->class0_hp); + + pre_last_prob = MAX_PROB - pre_nmv_prob->class0_hp; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0_hp; + + diff += (int)nmv_count->class0_hp[1] * (pre_last_prob - cur_last_prob); + + // hp + diff += (int)nmv_count->hp[0] * (pre_nmv_prob->hp - cur_nmv_prob->hp); + + pre_last_prob = MAX_PROB - pre_nmv_prob->hp; + cur_last_prob = MAX_PROB - cur_nmv_prob->hp; + + diff += (int)nmv_count->hp[1] * (pre_last_prob - cur_last_prob); + } + + return -diff; +} + // Test for whether to calculate metrics for the frame. 
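// [Editor's note -- annotation, not part of the upstream diff.] The two
// helpers above feed the skip-recode test added to encode_with_recode_loop()
// further down: compute_context_model_diff() is a linear proxy for how far
// the adapted probability model moved (symbol counts times probability
// deltas), and recoding is skipped while it stays under the adaptive
// threshold. Illustrative numbers, assuming 1080p at 8000 kbps with
// base_qindex == 0 (so qindex_factor == 127):
//   frame_size = (1920 * 1080) >> 10 = 2025
//   thresh     = ((128 * 2025 - 8 * 8000) * 127) >> 9 ~= 48400
static int skip_recode_sketch(const VP9_COMP *cpi, const VP9_COMMON *cm) {
  return compute_context_model_diff(cm) < compute_context_model_thresh(cpi);
}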
static int is_psnr_calc_enabled(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -110,22 +438,22 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) { /* clang-format off */ const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { - { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 }, - { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 }, - { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 }, - { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 }, - { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 }, - { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 }, - { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 }, - { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 }, - { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 }, - { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 }, + { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 }, + { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 }, + { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 }, + { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 }, + { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 }, + { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 }, + { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 }, + { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 }, + { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 }, + { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 }, // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when - // they are finalized (currently TBD). - { LEVEL_5_2, 1176502272, 8912896, 180000, 0, 8, 8, 10, 4 }, - { LEVEL_6, 1176502272, 35651584, 180000, 0, 8, 16, 10, 4 }, - { LEVEL_6_1, 2353004544u, 35651584, 240000, 0, 8, 16, 10, 4 }, - { LEVEL_6_2, 4706009088u, 35651584, 480000, 0, 8, 16, 10, 4 }, + // they are finalized (currently tentative). 
+ { LEVEL_5_2, 1176502272, 8912896, 180000, 90000, 8, 8, 10, 4 }, + { LEVEL_6, 1176502272, 35651584, 180000, 90000, 8, 16, 10, 4 }, + { LEVEL_6_1, 2353004544u, 35651584, 240000, 180000, 8, 16, 10, 4 }, + { LEVEL_6_2, 4706009088u, 35651584, 480000, 360000, 8, 16, 10, 4 }, }; /* clang-format on */ @@ -2390,7 +2718,9 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, #if CONFIG_VP9_HIGHBITDEPTH static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int bd) { + YV12_BUFFER_CONFIG *dst, int bd, + INTERP_FILTER filter_type, + int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; @@ -2400,7 +2730,7 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; int x, y, i; for (i = 0; i < MAX_MB_PLANE; ++i) { @@ -2408,16 +2738,17 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_stride = src_strides[i]; const int dst_stride = dst_strides[i]; for (y = 0; y < dst_h; y += 16) { - const int y_q4 = y * (16 / factor) * src_h / dst_h; + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; for (x = 0; x < dst_w; x += 16) { - const int x_q4 = x * (16 / factor) * src_w / dst_w; + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h * src_stride + (x / factor) * src_w / dst_w; uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); if (src->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, + vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel[x_q4 & 0xf], 16 * src_w / dst_w, kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, 16 / factor, bd); @@ -2618,6 +2949,10 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { MACROBLOCKD *xd = &cpi->td.mb.e_mbd; struct loopfilter *lf = &cm->lf; + const int is_reference_frame = + (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || + cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + if (xd->lossless) { lf->filter_level = 0; lf->last_filt_level = 0; @@ -2643,7 +2978,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); } - if (lf->filter_level > 0) { + if (lf->filter_level > 0 && is_reference_frame) { vp9_build_mask_frame(cm, lf->filter_level, 0); if (cpi->num_workers > 1) @@ -2708,7 +3043,8 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth); + scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, + EIGHTTAP, 0); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -2731,7 +3067,7 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - vp9_scale_and_extend_frame(ref, 
&new_fb_ptr->buf); + vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -3118,6 +3454,15 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. + const INTERP_FILTER filter_scaler = + (is_one_pass_cbr_svc(cpi)) + ? cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] + : EIGHTTAP; + const int phase_scaler = + (is_one_pass_cbr_svc(cpi)) + ? cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] + : 0; + // Flag to check if its valid to compute the source sad (used for // scene detection and for superblock content state in CBR mode). // The flag may get reset below based on SVC or resizing state. @@ -3136,8 +3481,11 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2 // result will be saved in scaled_temp and might be used later. + const INTERP_FILTER filter_scaler2 = cpi->svc.downsample_filter_type[1]; + const int phase_scaler2 = cpi->svc.downsample_filter_phase[1]; cpi->Source = vp9_svc_twostage_scale( - cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp); + cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp, + filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); cpi->svc.scaled_one_half = 1; } else if (is_one_pass_cbr_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && @@ -3149,16 +3497,17 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->svc.scaled_one_half = 0; } else { cpi->Source = vp9_scale_if_required( - cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0)); + cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0), + filter_scaler, phase_scaler); } // Unfiltered raw source used in metrics calculation if the source // has been filtered. if (is_psnr_calc_enabled(cpi)) { #ifdef ENABLE_KF_DENOISE if (is_spatial_denoise_enabled(cpi)) { - cpi->raw_source_frame = - vp9_scale_if_required(cm, &cpi->raw_unscaled_source, - &cpi->raw_scaled_source, (cpi->oxcf.pass == 0)); + cpi->raw_source_frame = vp9_scale_if_required( + cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (cpi->oxcf.pass == 0), EIGHTTAP, phase_scaler); } else { cpi->raw_source_frame = cpi->Source; } @@ -3190,9 +3539,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION || (cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) || cpi->compute_source_sad_onepass)) - cpi->Last_Source = - vp9_scale_if_required(cm, cpi->unscaled_last_source, - &cpi->scaled_last_source, (cpi->oxcf.pass == 0)); + cpi->Last_Source = vp9_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + (cpi->oxcf.pass == 0), EIGHTTAP, 0); if (cpi->Last_Source == NULL || cpi->Last_Source->y_width != cpi->Source->y_width || @@ -3214,10 +3563,11 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->oxcf.content == VP9E_CONTENT_SCREEN)) vp9_scene_detection_onepass(cpi); - // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference - // frame (i.e, svc->force_zero_mode_spatial_ref = 0), we can avoid this - // frame-level upsampling. 
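// [Editor's note -- annotation, not part of the upstream diff.] The
// scale_and_extend_frame() hunk above threads a phase_scaler offset into the
// 1/16-pel sampling position, so SVC layers can choose both the filter type
// and the sampling phase (cpi->svc.downsample_filter_type/_phase). A
// simplified per-pixel model of the kernel selection, assuming factor == 1:
static int subpel_phase_sketch(int x, int src_w, int dst_w, int phase_scaler) {
  const int x_q4 = x * 16 * src_w / dst_w + phase_scaler; /* 1/16-pel units */
  return x_q4 & 0xf; /* picks one of the 16 subpel interpolation kernels */
}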
- if (frame_is_intra_only(cm) == 0 && !is_one_pass_cbr_svc(cpi)) { + // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame + // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can + // avoid this frame-level upsampling (for non intra_only frames). + if (frame_is_intra_only(cm) == 0 && + !(is_one_pass_cbr_svc(cpi) && cpi->svc.force_zero_mode_spatial_ref)) { vp9_scale_references(cpi); } @@ -3374,8 +3724,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, &frame_over_shoot_limit); } - cpi->Source = vp9_scale_if_required( - cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0)); + cpi->Source = + vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source, + (cpi->oxcf.pass == 0), EIGHTTAP, 0); // Unfiltered raw source used in metrics calculation if the source // has been filtered. @@ -3384,7 +3735,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) { cpi->raw_source_frame = vp9_scale_if_required( cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, - (cpi->oxcf.pass == 0)); + (cpi->oxcf.pass == 0), EIGHTTAP, 0); } else { cpi->raw_source_frame = cpi->Source; } @@ -3394,9 +3745,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, } if (cpi->unscaled_last_source != NULL) - cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, - &cpi->scaled_last_source, - (cpi->oxcf.pass == 0)); + cpi->Last_Source = vp9_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + (cpi->oxcf.pass == 0), EIGHTTAP, 0); if (frame_is_intra_only(cm) == 0) { if (loop_count > 0) { @@ -3625,6 +3976,15 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, #endif if (enable_acl) { + // Skip recoding, if model diff is below threshold + const int thresh = compute_context_model_thresh(cpi); + const int diff = compute_context_model_diff(cm); + if (diff < thresh) { + vpx_clear_system_state(); + restore_coding_context(cpi); + return; + } + vp9_encode_frame(cpi); vpx_clear_system_state(); restore_coding_context(cpi); @@ -3674,23 +4034,28 @@ static void set_ext_overrides(VP9_COMP *cpi) { } } -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp) { +YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) { if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->bit_depth == VPX_BITS_8) { - vp9_scale_and_extend_frame(unscaled, scaled_temp); - vp9_scale_and_extend_frame(scaled_temp, scaled); + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, + phase_scaler); } else { - scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth); - scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth); + scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth, + filter_type2, phase_scaler2); + scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); } #else - vp9_scale_and_extend_frame(unscaled, scaled_temp); - vp9_scale_and_extend_frame(scaled_temp, scaled); + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + 
phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler); #endif // CONFIG_VP9_HIGHBITDEPTH return scaled; } else { @@ -3698,25 +4063,25 @@ YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, } } -YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - int use_normative_scaler) { +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) { if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) if (cm->bit_depth == VPX_BITS_8) - vp9_scale_and_extend_frame(unscaled, scaled); + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); else - scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth); + scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); else scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); #else if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) - vp9_scale_and_extend_frame(unscaled, scaled); + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); else scale_and_extend_frame_nonnormative(unscaled, scaled); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -4049,12 +4414,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, ++cm->current_video_frame; cpi->ext_refresh_frame_flags_pending = 0; cpi->svc.rc_drop_superframe = 1; + cpi->last_frame_dropped = 1; // TODO(marpan): Advancing the svc counters on dropped frames can break // the referencing scheme for the fixed svc patterns defined in // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but // for now, don't advance the svc frame counters on dropped frame. 
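// [Editor's note -- annotation, not part of the upstream diff.] Call shape
// for the 1/4x1/4 SVC path, as used in encode_without_recode_loop() above:
// two chained 1:2 passes, each with its own filter/phase, so the optimized
// 1:2 scaler is reused and the half-size intermediate (scaled_temp) can be
// kept for other layers. Fragment copied from the diff, comments added:
//   cpi->Source = vp9_svc_twostage_scale(
//       cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp,
//       filter_scaler, phase_scaler,     // second pass: 1/2 -> 1/4 (output)
//       filter_scaler2, phase_scaler2);  // first pass: full -> 1/2 (temp)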
// if (cpi->use_svc) // vp9_inc_frame_in_layer(cpi); + return; } } @@ -4072,6 +4439,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, encode_with_recode_loop(cpi, size, dest); } + cpi->last_frame_dropped = 0; + // Disable segmentation if it decreases the rate/distortion ratio if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) vp9_try_disable_lookahead_aq(cpi, size, dest); @@ -5261,4 +5630,9 @@ void vp9_set_row_mt(VP9_COMP *cpi) { if (cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->oxcf.row_mt) { cpi->row_mt = 1; } + + if (cpi->row_mt && cpi->oxcf.max_threads > 1) + cpi->row_mt_bit_exact = 1; + else + cpi->row_mt_bit_exact = 0; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h index 6c1cb6073e8..672c83bfdf9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h @@ -137,6 +137,7 @@ typedef enum { kLowSadHighSumdiff = 2, kHighSadLowSumdiff = 3, kHighSadHighSumdiff = 4, + kLowVarHighSumdiff = 5, } CONTENT_STATE_SB; typedef struct VP9EncoderConfig { @@ -268,7 +269,6 @@ typedef struct VP9EncoderConfig { VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode; int row_mt; - unsigned int row_mt_bit_exact; unsigned int motion_vector_unit_test; } VP9EncoderConfig; @@ -281,17 +281,11 @@ typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; int mode_map[BLOCK_SIZES][MAX_MODES]; - int m_search_count; - int ex_search_count; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; // Used for adaptive_rd_thresh with row multithreading int *row_base_thresh_freq_fact; -#if CONFIG_MULTITHREAD - pthread_mutex_t *search_count_mutex; - pthread_mutex_t *enc_row_mt_mutex; -#endif } TileDataEnc; typedef struct RowMTInfo { @@ -695,7 +689,9 @@ typedef struct VP9_COMP { void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int); void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int); ARNRFilterData arnr_filter_data; + int row_mt; + unsigned int row_mt_bit_exact; // Previous Partition Info BLOCK_SIZE *prev_partition; @@ -708,6 +704,8 @@ typedef struct VP9_COMP { uint8_t *prev_variance_low; uint8_t *copied_frame_cnt; uint8_t max_copied_frame; + // If the last frame was dropped, we don't copy the partition. + uint8_t last_frame_dropped; // For each superblock: keeps track of the last time (in frame distance) the // superblock did not have low source sad.
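The scaling entry points above now take an explicit interpolation filter and phase; EIGHTTAP with phase 0, as passed at the call sites, reproduces the previous hard-coded behaviour. In vp9_svc_twostage_scale() the filter_type2/phase_scaler2 pair drives the first stage (unscaled -> scaled_temp) and the unsuffixed pair the second (scaled_temp -> scaled). A minimal caller sketch, with the buffers assumed to be allocated elsewhere and the stage-1 filter/phase chosen purely for illustration (phase_scaler is added to the q4 sub-pel coordinates in vp9_scale_and_extend_frame_c(), so it acts as a 1/16-pel offset in the range 0..15):
  /* Illustrative call only: the "2" pair is consumed by the first scaling stage. */
  YV12_BUFFER_CONFIG *out = vp9_svc_twostage_scale(
      cm, &unscaled, &scaled, &scaled_temp,
      EIGHTTAP, 0,         /* stage 2: scaled_temp -> scaled             */
      EIGHTTAP_SMOOTH, 8); /* stage 1: unscaled -> scaled_temp (example) */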
@@ -840,15 +838,14 @@ void vp9_update_reference_frames(VP9_COMP *cpi); void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp); +YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2); -YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - int use_normative_scaler); +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler); void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c index 681e960c8df..51664112a44 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c @@ -552,7 +552,6 @@ static int enc_row_mt_worker_hook(EncWorkerData *const thread_data, const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; int tile_row, tile_col; - TileDataEnc *this_tile; int end_of_frame; int thread_id = thread_data->thread_id; int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; @@ -574,13 +573,6 @@ static int enc_row_mt_worker_hook(EncWorkerData *const thread_data, tile_row = proc_job->tile_row_id; mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE; - this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; - thread_data->td->mb.m_search_count_ptr = &this_tile->m_search_count; - thread_data->td->mb.ex_search_count_ptr = &this_tile->ex_search_count; -#if CONFIG_MULTITHREAD - thread_data->td->mb.search_count_mutex = this_tile->search_count_mutex; -#endif - vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c index 222e27a9f26..b6e3275482c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c @@ -42,15 +42,12 @@ #define OUTPUT_FPF 0 #define ARF_STATS_OUTPUT 0 -#define FACTOR_PT_LOW 0.70 -#define FACTOR_PT_HIGH 0.90 #define FIRST_PASS_Q 10.0 #define GF_MAX_BOOST 96.0 #define INTRA_MODE_PENALTY 1024 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 #define NEW_MV_MODE_PENALTY 32 -#define SVC_FACTOR_PT_LOW 0.45 #define DARK_THRESH 64 #define DEFAULT_GRP_WEIGHT 1.0 #define RC_FACTOR_MIN 0.75 @@ -241,14 +238,14 @@ static double calculate_active_area(const VP9_COMP *cpi, // Calculate a modified Error used in distributing bits between easier and // harder frames. 
#define ACT_AREA_CORRECTION 0.5 -static double calculate_modified_err(const VP9_COMP *cpi, - const TWO_PASS *twopass, - const VP9EncoderConfig *oxcf, - const FIRSTPASS_STATS *this_frame) { +static double calculate_mod_frame_score(const VP9_COMP *cpi, + const TWO_PASS *twopass, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame) { const FIRSTPASS_STATS *const stats = &twopass->total_stats; const double av_weight = stats->weight / stats->count; const double av_err = (stats->coded_error * av_weight) / stats->count; - double modified_error = + double modified_score = av_err * pow(this_frame->coded_error * this_frame->weight / DOUBLE_DIVIDE_CHECK(av_err), oxcf->two_pass_vbrbias / 100.0); @@ -258,11 +255,38 @@ static double calculate_modified_err(const VP9_COMP *cpi, // remaining active MBs. The correction here assumes that coding // 0.5N blocks of complexity 2X is a little easier than coding N // blocks of complexity X. - modified_error *= + modified_score *= pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION); - return fclamp(modified_error, twopass->modified_error_min, - twopass->modified_error_max); + return modified_score; +} +static double calculate_norm_frame_score(const VP9_COMP *cpi, + const TWO_PASS *twopass, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame) { + const FIRSTPASS_STATS *const stats = &twopass->total_stats; + const double av_weight = stats->weight / stats->count; + const double av_err = (stats->coded_error * av_weight) / stats->count; + double modified_score = + av_err * pow(this_frame->coded_error * this_frame->weight / + DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + + const double min_score = (double)(oxcf->two_pass_vbrmin_section) / 100.0; + const double max_score = (double)(oxcf->two_pass_vbrmax_section) / 100.0; + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. + modified_score *= + pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION); + + // Normalize to a midpoint score. + modified_score /= DOUBLE_DIVIDE_CHECK(twopass->mean_mod_score); + + return fclamp(modified_score, min_score, max_score); } // This function returns the maximum target rate per frame. 
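Of the two functions above, calculate_mod_frame_score() returns the raw, unclamped bias score, while calculate_norm_frame_score() divides by the clip-wide mean (twopass->mean_mod_score, established in vp9_init_second_pass() further down) so that 1.0 denotes an average frame, then clamps to the user's VBR section limits; the result is used directly as a linear bit-allocation weight. A condensed restatement of that final normalize-and-clamp step, with names abbreviated and the bias and active-area terms assumed already folded into raw_score:
  /* raw_score = av_err * (err / av_err)^(vbrbias / 100) * active_area^0.5,
   * per calculate_mod_frame_score(); mean_raw is its clip-wide average. */
  static double norm_frame_score(double raw_score, double mean_raw,
                                 int vbrmin_section, int vbrmax_section) {
    const double lo = vbrmin_section / 100.0; /* oxcf->two_pass_vbrmin_section */
    const double hi = vbrmax_section / 100.0; /* oxcf->two_pass_vbrmax_section */
    /* Guard against a zero mean, as DOUBLE_DIVIDE_CHECK does upstream. */
    const double s = raw_score / (mean_raw > 0.0 ? mean_raw : 1e-10);
    return s < lo ? lo : (s > hi ? hi : s); /* fclamp(s, lo, hi) */
  }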
@@ -710,9 +734,14 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, fps->frame = cm->current_video_frame; fps->spatial_layer_id = cpi->svc.spatial_layer_id; - fps->coded_error = (double)(fp_acc_data->coded_error >> 8) + min_err; - fps->sr_coded_error = (double)(fp_acc_data->sr_coded_error >> 8) + min_err; - fps->intra_error = (double)(fp_acc_data->intra_error >> 8) + min_err; + + fps->coded_error = + ((double)(fp_acc_data->coded_error >> 8) + min_err) / num_mbs; + fps->sr_coded_error = + ((double)(fp_acc_data->sr_coded_error >> 8) + min_err) / num_mbs; + fps->intra_error = + ((double)(fp_acc_data->intra_error >> 8) + min_err) / num_mbs; + fps->frame_noise_energy = (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs; fps->count = 1.0; @@ -979,12 +1008,12 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, if (log_intra < 10.0) { mb_intra_factor = 1.0 + ((10.0 - log_intra) * 0.05); fp_acc_data->intra_factor += mb_intra_factor; - if (cpi->oxcf.row_mt_bit_exact) + if (cpi->row_mt_bit_exact) cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = mb_intra_factor; } else { fp_acc_data->intra_factor += 1.0; - if (cpi->oxcf.row_mt_bit_exact) + if (cpi->row_mt_bit_exact) cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0; } @@ -999,12 +1028,12 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { mb_brightness_factor = 1.0 + (0.01 * (DARK_THRESH - level_sample)); fp_acc_data->brightness_factor += mb_brightness_factor; - if (cpi->oxcf.row_mt_bit_exact) + if (cpi->row_mt_bit_exact) cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = mb_brightness_factor; } else { fp_acc_data->brightness_factor += 1.0; - if (cpi->oxcf.row_mt_bit_exact) + if (cpi->row_mt_bit_exact) cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = 1.0; } @@ -1166,7 +1195,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, if (((this_error - intrapenalty) * 9 <= motion_error * 10) && (this_error < (2 * intrapenalty))) { fp_acc_data->neutral_count += 1.0; - if (cpi->oxcf.row_mt_bit_exact) + if (cpi->row_mt_bit_exact) cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = 1.0; // Also track cases where the intra is not much worse than the inter @@ -1176,7 +1205,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, mb_neutral_count = (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); fp_acc_data->neutral_count += mb_neutral_count; - if (cpi->oxcf.row_mt_bit_exact) + if (cpi->row_mt_bit_exact) cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = mb_neutral_count; } @@ -1400,7 +1429,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? 
GOLDEN_FRAME : NONE); cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, - &cpi->scaled_source, 0); + &cpi->scaled_source, 0, EIGHTTAP, 0); } vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -1424,7 +1453,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { cm->log2_tile_rows = 0; - if (cpi->oxcf.row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL) + if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL) CHECK_MEM_ERROR( cm, cpi->twopass.fp_mb_float_stats, vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1)); @@ -1441,13 +1470,13 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { } else { cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; - if (cpi->oxcf.row_mt_bit_exact) { + if (cpi->row_mt_bit_exact) { cm->log2_tile_cols = 0; vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs); } vp9_encode_fp_row_mt(cpi); first_tile_col = &cpi->tile_data[0]; - if (cpi->oxcf.row_mt_bit_exact) + if (cpi->row_mt_bit_exact) accumulate_floating_point_stats(cpi, first_tile_col); first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data)); } @@ -1522,14 +1551,22 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); } +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { + 0.65, 0.70, 0.75, 0.85, 0.90, 0.90, 0.90, 1.00, 1.25 +}; + static double calc_correction_factor(double err_per_mb, double err_divisor, - double pt_low, double pt_high, int q, - vpx_bit_depth_t bit_depth) { - const double error_term = err_per_mb / err_divisor; + int q) { + const double error_term = err_per_mb / DOUBLE_DIVIDE_CHECK(err_divisor); + const int index = q >> 5; + double power_term; + + assert((index >= 0) && (index < (QINDEX_RANGE >> 5))); - // Adjustment based on actual quantizer to power term. - const double power_term = - VPXMIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high); + // Adjustment based on quantizer to the power term. + power_term = + q_pow_term[index] + + (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); // Calculate correction factor. if (power_term < 1.0) assert(error_term >= 0.0); @@ -1560,17 +1597,14 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.MBs; - const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); - const double av_err_per_mb = section_err / active_mbs; + const double active_pct = VPXMAX(0.01, 1.0 - inactive_zone); + const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct); + const double av_err_per_mb = section_err / active_pct; const double speed_term = 1.0 + 0.04 * oxcf->speed; double last_group_rate_err; const int target_norm_bits_per_mb = (int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs); int q; - int is_svc_upper_layer = 0; - - if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) - is_svc_upper_layer = 1; // based on recent history adjust expectations of bits per macroblock. last_group_rate_err = @@ -1583,10 +1617,8 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, // Try and pick a max Q that will be high enough to encode the // content at the given rate. 
for (q = rc->best_quality; q < rc->worst_quality; ++q) { - const double factor = calc_correction_factor( - av_err_per_mb, ERR_DIVISOR, - is_svc_upper_layer ? SVC_FACTOR_PT_LOW : FACTOR_PT_LOW, - FACTOR_PT_HIGH, q, cpi->common.bit_depth); + const double factor = + calc_correction_factor(av_err_per_mb, ERR_DIVISOR, q); const int bits_per_mb = vp9_rc_bits_per_mb( INTER_FRAME, q, factor * speed_term * cpi->twopass.bpm_factor * noise_factor, @@ -1676,22 +1708,35 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // This variable monitors how far behind the second ref update is lagging. twopass->sr_update_lag = 1; - // Scan the first pass file and calculate a modified total error based upon - // the bias/power function used to allocate bits. + // Scan the first pass file and calculate a modified score for each + // frame that is used to distribute bits. The modified score is assumed + // to provide a linear basis for bit allocation. I.e. a frame A with a score + // that is double that of frame B will be allocated 2x as many bits. { - const double avg_error = - stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); const FIRSTPASS_STATS *s = twopass->stats_in; - double modified_error_total = 0.0; - twopass->modified_error_min = - (avg_error * oxcf->two_pass_vbrmin_section) / 100; - twopass->modified_error_max = - (avg_error * oxcf->two_pass_vbrmax_section) / 100; + double modified_score_total = 0.0; + + // The first scan is unclamped and gives a raw average. + while (s < twopass->stats_in_end) { + modified_score_total += calculate_mod_frame_score(cpi, twopass, oxcf, s); + ++s; + } + + // The average error from this first scan is used to define the midpoint + // error for the rate distribution function. + twopass->mean_mod_score = + modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count); + + // Second scan using clamps based on the previous cycle average. + // This may modify the total and average somewhat but we don't bother with + // further iterations. + s = twopass->stats_in; + modified_score_total = 0.0; while (s < twopass->stats_in_end) { - modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s); + modified_score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s); ++s; } - twopass->modified_error_left = modified_error_total; + twopass->normalized_score_left = modified_score_total; } // Reset the vbr bits off target counters @@ -1728,9 +1773,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { static double get_sr_decay_rate(const VP9_COMP *cpi, const FIRSTPASS_STATS *frame) { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ?
cpi->initial_mbs - : cpi->common.MBs; - double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; + double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; double modified_pct_inter; double modified_pcnt_intra; @@ -1739,7 +1782,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi, (cpi->initial_height + cpi->initial_width)); modified_pct_inter = frame->pcnt_inter; - if (((frame->coded_error / num_mbs) > LOW_CODED_ERR_PER_MB) && + if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < (double)NCOUNT_FRAME_II_THRESH)) { modified_pct_inter = @@ -1861,20 +1904,16 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, const double lq = vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5); - int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; - - // Correct for any inactive region in the image - num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); + const double active_area = calculate_active_area(cpi, this_frame); // Underlying boost factor is based on inter error ratio. - frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / + frame_boost = (BASELINE_ERR_PER_MB * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); // Update the accumulator for second ref error difference. // This is intended to give an indication of how much the coded error is // increasing over time. - *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1; + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); *sr_accumulator = VPXMAX(0.0, *sr_accumulator); // Small adjustment for cases where there is a zoom out @@ -1897,20 +1936,16 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, const double lq = vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00); - int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; - - // Correct for any inactive region in the image - num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); + const double active_area = calculate_active_area(cpi, this_frame); // Underlying boost factor is based on inter error ratio. - frame_boost = (KF_BASELINE_ERR_PER_MB * num_mbs) / + frame_boost = (KF_BASELINE_ERR_PER_MB * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); // Update the accumulator for second ref error difference. // This is intended to give an indication of how much the coded error is // increasing over time. - *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1; + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); *sr_accumulator = VPXMAX(0.0, *sr_accumulator); // Small adjustment for cases where there is a zoom out @@ -2043,7 +2078,7 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, int64_t total_group_bits; // Calculate the bits to be allocated to the group as a whole. 
- if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) { total_group_bits = (int64_t)(twopass->kf_group_bits * (gf_group_err / twopass->kf_group_error_left)); } else { @@ -2337,7 +2372,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); // Load stats for the current frame. - mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); // Note the error of the frame at the start of the group. This will be // the GF frame error if we code a normal gf. @@ -2370,8 +2405,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->common.bit_depth)); active_min_gf_interval = rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200); - if (active_min_gf_interval > rc->max_gf_interval) - active_min_gf_interval = rc->max_gf_interval; + active_min_gf_interval = + VPXMIN(active_min_gf_interval, rc->max_gf_interval + arf_active_or_kf); if (cpi->multi_arf_allowed) { active_max_gf_interval = rc->max_gf_interval; @@ -2382,11 +2417,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // interval to spread the cost of the GF. active_max_gf_interval = 12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6)); - // We have: active_min_gf_interval <= rc->max_gf_interval - if (active_max_gf_interval < active_min_gf_interval) + // We have: active_min_gf_interval <= + // rc->max_gf_interval + arf_active_or_kf. + if (active_max_gf_interval < active_min_gf_interval) { active_max_gf_interval = active_min_gf_interval; - else if (active_max_gf_interval > rc->max_gf_interval) - active_max_gf_interval = rc->max_gf_interval; + } else { + active_max_gf_interval = VPXMIN(active_max_gf_interval, + rc->max_gf_interval + arf_active_or_kf); + } // Would the active max drop us out just before the next kf? if ((active_max_gf_interval <= rc->frames_to_key) && @@ -2400,7 +2438,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ++i; // Accumulate error score of frames in this gf group. - mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); gf_group_err += mod_frame_err; gf_group_raw_error += this_frame->coded_error; gf_group_noise += this_frame->frame_noise_energy; @@ -2509,7 +2547,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int j; for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { if (EOF == input_stats(twopass, this_frame)) break; - gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + gf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); gf_group_raw_error += this_frame->coded_error; gf_group_noise += this_frame->frame_noise_energy; gf_group_skip_pct += this_frame->intra_skip_pct; @@ -2564,7 +2603,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_bits); // Adjust KF group bits and error remaining. - twopass->kf_group_error_left -= (int64_t)gf_group_err; + twopass->kf_group_error_left -= gf_group_err; // Allocate bits to each of the frames in the GF group.
allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits); @@ -2614,6 +2653,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #define II_IMPROVEMENT_THRESHOLD 3.5 #define KF_II_MAX 128.0 #define II_FACTOR 12.5 +// Test for very low intra complexity which could cause false key frames +#define V_LOW_INTRA 0.5 + static int test_candidate_kf(TWO_PASS *twopass, const FIRSTPASS_STATS *last_frame, const FIRSTPASS_STATS *this_frame, @@ -2672,7 +2714,7 @@ static int test_candidate_kf(TWO_PASS *twopass, 0.20) && (next_iiratio < 3.0)) || ((boost_score - old_boost_score) < 3.0) || - (local_next_frame.intra_error < 200)) { + (local_next_frame.intra_error < V_LOW_INTRA)) { break; } @@ -2748,10 +2790,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->frames_to_key = 1; - twopass->kf_group_bits = 0; // Total bits available to kf group - twopass->kf_group_error_left = 0; // Group modified error score. + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0.0; // Group modified error score. - kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_mod_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); // Initialize the decay rates for the recent frames to check for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; @@ -2761,7 +2803,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { while (twopass->stats_in < twopass->stats_in_end && rc->frames_to_key < cpi->oxcf.key_freq) { // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); // Load the next frame's stats. last_frame = *this_frame; @@ -2821,7 +2863,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Rescan to get the correct error data for the forced kf group. for (i = 0; i < rc->frames_to_key; ++i) { - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame); input_stats(twopass, &tmp_frame); } rc->next_key_frame_forced = 1; @@ -2838,7 +2881,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int j; for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { if (EOF == input_stats(twopass, this_frame)) break; - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); } rc->frames_to_key = new_frame_to_key; } @@ -2846,11 +2890,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); } // Calculate the number of bits that should be assigned to the kf group. - if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { + if (twopass->bits_left > 0 && twopass->normalized_score_left > 0.0) { // Maximum number of bits for a single normal frame (not key frame). 
const int max_bits = frame_max_bits(rc, &cpi->oxcf); @@ -2860,7 +2904,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default allocation based on bits left and relative // complexity of the section. twopass->kf_group_bits = (int64_t)( - twopass->bits_left * (kf_group_err / twopass->modified_error_left)); + twopass->bits_left * (kf_group_err / twopass->normalized_score_left)); // Clip based on maximum per frame rate defined by the user. max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; @@ -2933,12 +2977,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group->rf_level[0] = KF_STD; // Note the total error score of the kf group minus the key frame itself. - twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); + twopass->kf_group_error_left = (kf_group_err - kf_mod_err); // Adjust the count of total modified error left. // The count of bits left is adjusted elsewhere based on real coded frame // sizes. - twopass->modified_error_left -= kf_group_err; + twopass->normalized_score_left -= kf_group_err; if (oxcf->resize_mode == RESIZE_DYNAMIC) { // Default to normal-sized frame on keyframes. @@ -3170,16 +3214,10 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { target_rate = gf_group->bit_allocation[gf_group->index]; rc->base_frame_target = target_rate; - { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) - ? cpi->initial_mbs - : cpi->common.MBs; - // The multiplication by 256 reverses a scaling factor of (>> 8) - // applied when combining MB error values for the frame. - twopass->mb_av_energy = - log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0); - twopass->mb_smooth_pct = this_frame.intra_smooth_pct; - } + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); + twopass->mb_smooth_pct = this_frame.intra_smooth_pct; // Update the total stats remaining structure. 
subtract_stats(&twopass->total_left_stats, &this_frame); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h index d660aa1ffb8..000ecd77926 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h @@ -138,9 +138,8 @@ typedef struct { FIRSTPASS_STATS total_left_stats; int first_pass_done; int64_t bits_left; - double modified_error_min; - double modified_error_max; - double modified_error_left; + double mean_mod_score; + double normalized_score_left; double mb_av_energy; double mb_smooth_pct; @@ -159,7 +158,7 @@ typedef struct { int64_t kf_group_bits; // Error score of frames still to be coded in kf group - int64_t kf_group_error_left; + double kf_group_error_left; double bpm_factor; int rolling_arf_group_target_bits; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_frame_scale.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_frame_scale.c index 349e7bd41d8..e58628388f0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_frame_scale.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_frame_scale.c @@ -16,7 +16,8 @@ #include "vpx_scale/yv12config.h" void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; @@ -26,7 +27,7 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; int x, y, i; for (i = 0; i < MAX_MB_PLANE; ++i) { @@ -34,9 +35,9 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, const int src_stride = src_strides[i]; const int dst_stride = dst_strides[i]; for (y = 0; y < dst_h; y += 16) { - const int y_q4 = y * (16 / factor) * src_h / dst_h; + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; for (x = 0; x < dst_w; x += 16) { - const int x_q4 = x * (16 / factor) * src_w / dst_w; + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h * src_stride + (x / factor) * src_w / dst_w; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c index a3939a5f85d..24e23af3b15 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c @@ -1998,18 +1998,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, int range = sf->mesh_patterns[0].range; int baseline_interval_divisor; -#if CONFIG_MULTITHREAD - if (NULL != x->search_count_mutex) pthread_mutex_lock(x->search_count_mutex); -#endif - - // Keep track of number of exhaustive calls (this frame in this thread). 
- ++(*x->ex_search_count_ptr); - -#if CONFIG_MULTITHREAD - if (NULL != x->search_count_mutex) - pthread_mutex_unlock(x->search_count_mutex); -#endif - // Trap illegal values for interval and range for this function. if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || (interval > range)) @@ -2367,32 +2355,6 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, return best_sad; } -#define MIN_EX_SEARCH_LIMIT 128 -static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) { - const SPEED_FEATURES *const sf = &cpi->sf; - int is_exhaustive_allowed; - int max_ex; - -#if CONFIG_MULTITHREAD - if (NULL != x->search_count_mutex) pthread_mutex_lock(x->search_count_mutex); -#endif - - max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT, - (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100); - - is_exhaustive_allowed = sf->allow_exhaustive_searches && - (sf->exhaustive_searches_thresh < INT_MAX) && - (*x->ex_search_count_ptr <= max_ex) && - !cpi->rc.is_src_frame_alt_ref; - -#if CONFIG_MULTITHREAD - if (NULL != x->search_count_mutex) - pthread_mutex_unlock(x->search_count_mutex); -#endif - - return is_exhaustive_allowed; -} - int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int search_method, int error_per_bit, int *cost_list, const MV *ref_mv, @@ -2435,21 +2397,9 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MAX_MVSEARCH_STEPS - 1 - step_param, 1, cost_list, fn_ptr, ref_mv, tmp_mv); -#if CONFIG_MULTITHREAD - if (NULL != x->search_count_mutex) - pthread_mutex_lock(x->search_count_mutex); -#endif - - // Keep track of number of searches (this frame in this thread). - ++(*x->m_search_count_ptr); - -#if CONFIG_MULTITHREAD - if (NULL != x->search_count_mutex) - pthread_mutex_unlock(x->search_count_mutex); -#endif - // Should we allow a follow on exhaustive search? 
- if (is_exhaustive_allowed(cpi, x)) { + if ((sf->exhaustive_searches_thresh < INT_MAX) && + !cpi->rc.is_src_frame_alt_ref) { int64_t exhuastive_thr = sf->exhaustive_searches_thresh; exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_multi_thread.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_multi_thread.c index f5d8e430c8a..da06fb151d8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_multi_thread.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_multi_thread.c @@ -110,24 +110,6 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { multi_thread_ctxt->num_tile_vert_sbs[tile_row] = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); } - -#if CONFIG_MULTITHREAD - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; - - CHECK_MEM_ERROR(cm, this_tile->search_count_mutex, - vpx_malloc(sizeof(*this_tile->search_count_mutex))); - - pthread_mutex_init(this_tile->search_count_mutex, NULL); - - CHECK_MEM_ERROR(cm, this_tile->enc_row_mt_mutex, - vpx_malloc(sizeof(*this_tile->enc_row_mt_mutex))); - - pthread_mutex_init(this_tile->enc_row_mt_mutex, NULL); - } - } -#endif } void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { @@ -170,12 +152,6 @@ void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { this_tile->row_base_thresh_freq_fact = NULL; } } - pthread_mutex_destroy(this_tile->search_count_mutex); - vpx_free(this_tile->search_count_mutex); - this_tile->search_count_mutex = NULL; - pthread_mutex_destroy(this_tile->enc_row_mt_mutex); - vpx_free(this_tile->enc_row_mt_mutex); - this_tile->enc_row_mt_mutex = NULL; } } #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c index fc2e32448e8..e2239b44b0f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -26,25 +26,27 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->level = kLowLow; ne->value = 0; ne->count = 0; - ne->thresh = 100; + ne->thresh = 90; ne->last_w = 0; ne->last_h = 0; if (width * height >= 1920 * 1080) { ne->thresh = 200; } else if (width * height >= 1280 * 720) { ne->thresh = 140; + } else if (width * height >= 640 * 360) { + ne->thresh = 100; } - ne->num_frames_estimate = 20; + ne->num_frames_estimate = 15; } static int enable_noise_estimation(VP9_COMP *const cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (cpi->common.use_highbitdepth) return 0; #endif -// Enable noise estimation if denoising is on, but not for low resolutions. +// Enable noise estimation if denoising is on. #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && - cpi->common.width >= 640 && cpi->common.height >= 360) + cpi->common.width >= 320 && cpi->common.height >= 180) return 1; #endif // Only allow noise estimate under certain encoding mode. @@ -97,6 +99,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { void vp9_update_noise_estimate(VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; NOISE_ESTIMATE *const ne = &cpi->noise_estimate; + const int low_res = (cm->width <= 352 && cm->height <= 288); // Estimate of noise level every frame_period frames. 
int frame_period = 8; int thresh_consec_zeromv = 6; @@ -108,8 +111,17 @@ // Estimate is between current source and last source. YV12_BUFFER_CONFIG *last_source = cpi->Last_Source; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) { last_source = &cpi->denoiser.last_source; + // Tune these thresholds for different resolutions when denoising is + // enabled. + if (cm->width > 640 && cm->width < 1920) { + thresh_consec_zeromv = 4; + thresh_sum_diff = 200; + thresh_sum_spatial = (120 * 120) << 8; + thresh_spatial_var = (48 * 48) << 8; + } + } #endif ne->enabled = enable_noise_estimation(cpi); if (cpi->svc.number_spatial_layers > 1) @@ -127,9 +139,12 @@ ne->last_h = cm->height; } return; - } else if (cpi->rc.avg_frame_low_motion < 50) { + } else if (cm->current_video_frame > 60 && + cpi->rc.avg_frame_low_motion < (low_res ? 70 : 50)) { // Force noise estimation to 0 and denoiser off if content has high motion. ne->level = kLowLow; + ne->count = 0; + ne->num_frames_estimate = 10; #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && cpi->svc.current_superframe > 1) { @@ -210,7 +225,8 @@ // Avoid blocks with high brightness and high spatial variance. if ((sse2 - spatial_variance) < thresh_sum_spatial && spatial_variance < thresh_spatial_var) { - avg_est += variance / ((spatial_variance >> 9) + 1); + avg_est += low_res ? variance >> 4 : variance / ((spatial_variance >> 9) + 1); num_samples++; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c index db2bbe7c272..b05f4184bd0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c @@ -170,6 +170,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + // Limit motion vector for large lighting change. + if (cpi->oxcf.speed > 5 && x->lowvar_highsumdiff) { + x->mv_limits.col_min = VPXMAX(x->mv_limits.col_min, -10); + x->mv_limits.row_min = VPXMAX(x->mv_limits.row_min, -10); + x->mv_limits.col_max = VPXMIN(x->mv_limits.col_max, 10); + x->mv_limits.row_max = VPXMIN(x->mv_limits.row_max, 10); + } + assert(x->mv_best_ref_index[ref] <= 2); if (x->mv_best_ref_index[ref] < 2) mvp_full = x->mbmi_ext->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv; @@ -203,9 +211,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar); if (rv) { - const int subpel_force_stop = use_base_mv && cpi->sf.base_mv_aggressive ?
2 - : cpi->sf.mv.subpel_force_stop; + const int subpel_force_stop = cpi->sf.mv.subpel_force_stop; cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, @@ -354,9 +360,11 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, *sse_y = sse; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && cpi->oxcf.speed > 5) + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5) ac_thr = vp9_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level, - (abs(sum) >> (bw + bh))); + (abs(sum) >> (bw + bh)), + cpi->svc.temporal_layer_id); else ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width, cpi->common.height, abs(sum) >> (bw + bh)); @@ -452,28 +460,32 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, // Transform skipping test in UV planes. for (i = 1; i <= 2; i++) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); - const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; - const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); - const int uv_bw = b_width_log2_lookup[uv_bsize]; - const int uv_bh = b_height_log2_lookup[uv_bsize]; - const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + - (uv_bh - b_height_log2_lookup[unit_size]); - const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); - const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); - int j = i - 1; - - vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); - var_uv[j] = cpi->fn_ptr[uv_bsize].vf( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); - - if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && - (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) - skip_uv[j] = 1; - else - break; + if (cpi->oxcf.speed < 8 || x->color_sensitivity[i - 1]) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); + const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); + const int uv_bw = b_width_log2_lookup[uv_bsize]; + const int uv_bh = b_height_log2_lookup[uv_bsize]; + const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + + (uv_bh - b_height_log2_lookup[unit_size]); + const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + int j = i - 1; + + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + var_uv[j] = cpi->fn_ptr[uv_bsize].vf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); + + if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && + (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) + skip_uv[j] = 1; + else + break; + } else { + skip_uv[i - 1] = 1; + } } // If the transform in YUV planes are skippable, the mode search checks @@ -481,7 +493,6 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, if (skip_uv[0] & skip_uv[1]) { *early_term = 1; } - return; } @@ -616,7 +627,7 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, int *skippable, int64_t *sse, BLOCK_SIZE bsize, - TX_SIZE tx_size) { + TX_SIZE tx_size, int rd_computed) { MACROBLOCKD *xd = 
&x->e_mbd; const struct macroblockd_plane *pd = &xd->plane[0]; struct macroblock_plane *const p = &x->plane[0]; @@ -643,8 +654,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, bsize < BLOCK_32X32)) { unsigned int var_y, sse_y; (void)tx_size; - model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, - &var_y, &sse_y); + if (!rd_computed) + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, + &var_y, &sse_y); *sse = INT_MAX; *skippable = 0; return; @@ -655,8 +667,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, bsize < BLOCK_32X32) { unsigned int var_y, sse_y; (void)tx_size; - model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, - &var_y, &sse_y); + if (!rd_computed) + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, + &var_y, &sse_y); *sse = INT_MAX; *skippable = 0; return; @@ -978,7 +991,7 @@ static void estimate_block_intra(int plane, int block, int row, int col, int64_t this_sse = INT64_MAX; // TODO(jingning): This needs further refactoring. block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx, - VPXMIN(tx_size, TX_16X16)); + VPXMIN(tx_size, TX_16X16), 0); } else { unsigned int var = 0; unsigned int sse = 0; @@ -1151,8 +1164,8 @@ static const REF_MODE ref_mode_set[RT_INTER_MODES] = { { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV } }; static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = { - { LAST_FRAME, ZEROMV }, { GOLDEN_FRAME, ZEROMV }, - { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV }, + { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV }, + { LAST_FRAME, NEARMV }, { GOLDEN_FRAME, ZEROMV }, { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV }, { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEWMV } }; @@ -1216,7 +1229,8 @@ static INLINE void find_predictors( static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd, PREDICTION_MODE this_mode, RD_COST *this_rdc, BLOCK_SIZE bsize, int mv_row, int mv_col, - int is_last_frame) { + int is_last_frame, int lowvar_highsumdiff, + int is_skin) { // Bias against MVs associated with NEWMV mode that are very different from // top/left neighbors. if (this_mode == NEWMV) { @@ -1263,9 +1277,12 @@ static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd, // If noise estimation is enabled, and estimated level is above threshold, // add a bias to LAST reference with small motion, for large blocks. 
if (ne->enabled && ne->level >= kMedium && bsize >= BLOCK_32X32 && - is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) { - this_rdc->rdcost = 7 * this_rdc->rdcost >> 3; - } + is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) + this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3); + else if (lowvar_highsumdiff && !is_skin && bsize >= BLOCK_16X16 && + is_last_frame && mv_row < 16 && mv_row > -16 && mv_col < 16 && + mv_col > -16) + this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3); } #if CONFIG_VP9_TEMPORAL_DENOISING @@ -1465,11 +1482,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int use_golden_nonzeromv = 1; int force_skip_low_temp_var = 0; int skip_ref_find_pred[4] = { 0 }; + unsigned int sse_zeromv_normalized = UINT_MAX; + unsigned int thresh_svc_skip_golden = 500; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; int denoise_svc_pickmode = 1; #endif + INTERP_FILTER filter_gf_svc = EIGHTTAP; init_ref_frame_cost(cm, xd, ref_frame_cost); @@ -1608,6 +1628,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int64_t this_sse; int is_skippable; int this_early_term = 0; + int rd_computed = 0; + PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; ref_frame = ref_mode_set[idx].ref_frame; @@ -1619,6 +1641,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (ref_frame > usable_ref_frame) continue; if (skip_ref_find_pred[ref_frame]) continue; + // For SVC, skip the golden (spatial) reference search if sse of zeromv_last + // is below threshold. + if (cpi->use_svc && ref_frame == GOLDEN_FRAME && + sse_zeromv_normalized < thresh_svc_skip_golden) + continue; + if (sf->short_circuit_flat_blocks && x->source_variance == 0 && this_mode != NEARESTMV) { continue; @@ -1715,15 +1743,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, &rd_thresh_freq_fact[mode_index])) || (!cpi->sf.adaptive_rd_thresh_row_mt && - rd_less_than_thresh( - best_rdc.rdcost, mode_rd_thresh, -#if CONFIG_MULTITHREAD - // Synchronization of this function - // is only necessary when - // adaptive_rd_thresh is > 0. - cpi->sf.adaptive_rd_thresh ? 
tile_data->enc_row_mt_mutex : NULL, -#endif - &rd_thresh_freq_fact[mode_index]))) + rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index]))) continue; if (this_mode == NEWMV) { @@ -1835,12 +1856,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { int pf_rate[3]; int64_t pf_dist[3]; + int curr_rate[3]; unsigned int pf_var[3]; unsigned int pf_sse[3]; TX_SIZE pf_tx_size[3]; int64_t best_cost = INT64_MAX; INTERP_FILTER best_filter = SWITCHABLE, filter; PRED_BUFFER *current_pred = this_mode_pred; + rd_computed = 1; for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) { int64_t cost; @@ -1848,6 +1871,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], &pf_var[filter], &pf_sse[filter]); + curr_rate[filter] = pf_rate[filter]; pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]); pf_tx_size[filter] = mi->tx_size; @@ -1873,7 +1897,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, mi->interp_filter = best_filter; mi->tx_size = pf_tx_size[best_filter]; - this_rdc.rate = pf_rate[best_filter]; + this_rdc.rate = curr_rate[best_filter]; this_rdc.dist = pf_dist[best_filter]; var_y = pf_var[best_filter]; sse_y = pf_sse[best_filter]; @@ -1887,6 +1911,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, ? bsize > BLOCK_32X32 : bsize >= BLOCK_32X32; mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref; + + if (cpi->use_svc && ref_frame == GOLDEN_FRAME && + svc_force_zero_mode[ref_frame - 1]) + mi->interp_filter = filter_gf_svc; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); // For large partition blocks, extra testing is done. @@ -1897,15 +1926,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col, &this_early_term); } else { + rd_computed = 1; model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y); } + // Save normalized sse (between current and last frame) for (0, 0) motion. 
+ if (cpi->use_svc && ref_frame == LAST_FRAME && + frame_mv[this_mode][ref_frame].as_int == 0) { + sse_zeromv_normalized = + sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + } } if (!this_early_term) { this_sse = (int64_t)sse_y; block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize, - VPXMIN(mi->tx_size, TX_16X16)); + VPXMIN(mi->tx_size, TX_16X16), rd_computed); + x->skip_txfm[0] = is_skippable; if (is_skippable) { this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); @@ -1956,7 +1993,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_NEWMV_diff_bias(&cpi->noise_estimate, xd, this_mode, &this_rdc, bsize, frame_mv[this_mode][ref_frame].as_mv.row, frame_mv[this_mode][ref_frame].as_mv.col, - ref_frame == LAST_FRAME); + ref_frame == LAST_FRAME, x->lowvar_highsumdiff, + x->sb_is_skin); } // Skipping checking: test to see if this block can be reconstructed by @@ -2038,7 +2076,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (best_rdc.rdcost == INT64_MAX || ((!force_skip_low_temp_var || bsize < BLOCK_32X32) && perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh && - bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad)) { + bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad && + !x->lowvar_highsumdiff)) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 }; int i; TX_SIZE best_intra_tx_size = TX_SIZES; @@ -2053,9 +2092,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, - this_mode_pred->data, this_mode_pred->stride, - NULL, 0, NULL, 0, bw, bh, xd->bd); + vpx_highbd_convolve_copy( + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride, + NULL, 0, NULL, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, @@ -2086,15 +2126,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, &rd_thresh_freq_fact[mode_index])) || (!cpi->sf.adaptive_rd_thresh_row_mt && - rd_less_than_thresh( - best_rdc.rdcost, mode_rd_thresh, -#if CONFIG_MULTITHREAD - // Synchronization of this function - // is only necessary when - // adaptive_rd_thresh is > 0. - cpi->sf.adaptive_rd_thresh ? 
tile_data->enc_row_mt_mutex : NULL, -#endif - &rd_thresh_freq_fact[mode_index]))) + rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index]))) continue; mi->mode = this_mode; @@ -2162,9 +2195,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, - pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, - bw, bh, xd->bd); + vpx_highbd_convolve_copy( + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, NULL, 0, + bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, bw, bh); @@ -2407,7 +2441,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_build_inter_predictor( - pd->pre[0].buf, pd->pre[0].stride, pd->dst.buf, pd->dst.stride, + CONVERT_TO_SHORTPTR(pd->pre[0].buf), pd->pre[0].stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, &xd->mi[0]->bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf, 4 * num_4x4_blocks_wide, 4 * num_4x4_blocks_high, 0, vp9_filter_kernels[mi->interp_filter], MV_PRECISION_Q3, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c index f79b7c6fc27..27fea5d4e78 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c @@ -547,6 +547,7 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, int active_best_quality, int active_worst_quality) { const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int q = active_worst_quality; int last_error = INT_MAX; int i, target_bits_per_mb, bits_per_mb_at_this_q; @@ -561,7 +562,7 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, do { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - cpi->svc.temporal_layer_id == 0 && + cr->apply_cyclic_refresh && (!cpi->oxcf.gf_cbr_boost_pct || !cpi->refresh_golden_frame)) { bits_per_mb_at_this_q = (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); @@ -2172,6 +2173,11 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { if (rate_err < 2.0 && !high_content) { rc->fac_active_worst_inter = 120; rc->fac_active_worst_gf = 90; + } else if (rate_err > 8.0 && rc->avg_frame_qindex[INTER_FRAME] < 16) { + // Increase active_worst faster at low Q if rate fluctuation is high. 
+ rc->fac_active_worst_inter = 200; + if (rc->avg_frame_qindex[INTER_FRAME] < 8) + rc->fac_active_worst_inter = 400; } if (low_content && rc->avg_frame_low_motion > 80) { rc->af_ratio_onepass_vbr = 15; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c index 3c49fe665d4..39a7742f0f4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c @@ -650,15 +650,7 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) { } void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, - int bsize, -#if CONFIG_MULTITHREAD - pthread_mutex_t *enc_row_mt_mutex, -#endif - int best_mode_index) { -#if CONFIG_MULTITHREAD - if (NULL != enc_row_mt_mutex) pthread_mutex_lock(enc_row_mt_mutex); -#endif - + int bsize, int best_mode_index) { if (rd_thresh > 0) { const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES; int mode; @@ -676,10 +668,6 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, } } } - -#if CONFIG_MULTITHREAD - if (NULL != enc_row_mt_mutex) pthread_mutex_unlock(enc_row_mt_mutex); -#endif } int vp9_get_intra_cost_penalty(int qindex, int qdelta, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h index aae47dcdda4..1e117686676 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h @@ -170,32 +170,11 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize, -#if CONFIG_MULTITHREAD - pthread_mutex_t *enc_row_mt_mutex, -#endif int best_mode_index); static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, -#if CONFIG_MULTITHREAD - pthread_mutex_t *enc_row_mt_mutex, -#endif const int *const thresh_fact) { - int is_rd_less_than_thresh; - -#if CONFIG_MULTITHREAD - // Synchronize to ensure data coherency as thresh_freq_fact is maintained at - // tile level and not thread-safe with row based multi-threading - if (NULL != enc_row_mt_mutex) pthread_mutex_lock(enc_row_mt_mutex); -#endif - - is_rd_less_than_thresh = - best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; - -#if CONFIG_MULTITHREAD - if (NULL != enc_row_mt_mutex) pthread_mutex_unlock(enc_row_mt_mutex); -#endif - - return is_rd_less_than_thresh; + return best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; } static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c index d23d324466d..bf0fec3d8d8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c @@ -599,28 +599,28 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - recon = CONVERT_TO_BYTEPTR(recon); - vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, - bs, bs, xd->bd); + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, + 32, NULL, 0, NULL, 0, bs, bs, xd->bd); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, 
recon, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } } + recon = CONVERT_TO_BYTEPTR(recon16); } else { #endif // CONFIG_VP9_HIGHBITDEPTH vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs); @@ -1004,6 +1004,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int block = (row + idy) * 2 + (col + idx); const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); @@ -1025,7 +1026,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0); if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next_highbd; - vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, + vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16, dst_stride, p->eobs[block], xd->bd); } else { int64_t unused; @@ -1048,7 +1049,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next_highbd; vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), - dst, dst_stride, p->eobs[block], xd->bd); + dst16, dst_stride, p->eobs[block], xd->bd); } } } @@ -1528,7 +1529,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_build_inter_predictor( - pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, + CONVERT_TO_SHORTPTR(pre), y_stride, CONVERT_TO_SHORTPTR(dst), + pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2), xd->bd); @@ -1783,9 +1785,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); vp9_highbd_build_inter_predictor( - ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, kernel, MV_PRECISION_Q3, - mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd); + CONVERT_TO_SHORTPTR(ref_yv12[!id].buf), ref_yv12[!id].stride, + second_pred_alloc_16, pw, &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, + kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd); } else { second_pred = (uint8_t *)second_pred_alloc_16; vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, @@ -3160,11 +3162,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, for (i = 0; i 
<= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; -#if CONFIG_MULTITHREAD - if (NULL != tile_data->enc_row_mt_mutex) - pthread_mutex_lock(tile_data->enc_row_mt_mutex); -#endif - for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; @@ -3186,11 +3183,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, memcpy(mode_map, tile_mode_map, sizeof(mode_map)); -#if CONFIG_MULTITHREAD - if (NULL != tile_data->enc_row_mt_mutex) - pthread_mutex_unlock(tile_data->enc_row_mt_mutex); -#endif - for (midx = 0; midx < MAX_MODES; ++midx) { int mode_index = mode_map[midx]; int mode_excluded = 0; @@ -3627,11 +3619,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (!cpi->rc.is_src_frame_alt_ref) vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, - sf->adaptive_rd_thresh, bsize, -#if CONFIG_MULTITHREAD - tile_data->enc_row_mt_mutex, -#endif - best_mode_index); + sf->adaptive_rd_thresh, bsize, best_mode_index); // macroblock modes *mi = best_mbmode; @@ -3771,11 +3759,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, (cm->interp_filter == mi->interp_filter)); vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, - cpi->sf.adaptive_rd_thresh, bsize, -#if CONFIG_MULTITHREAD - tile_data->enc_row_mt_mutex, -#endif - THR_ZEROMV); + cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV); vp9_zero(best_pred_diff); vp9_zero(best_filter_diff); @@ -3921,9 +3905,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (!internal_active_edge && rd_less_than_thresh(best_rd, rd_opt->threshes[segment_id][bsize][ref_index], -#if CONFIG_MULTITHREAD - tile_data->enc_row_mt_mutex, -#endif &rd_thresh_freq_fact[ref_index])) continue; @@ -4373,11 +4354,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, !is_inter_block(&best_mbmode)); vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, sf->adaptive_rd_thresh, - bsize, -#if CONFIG_MULTITHREAD - tile_data->enc_row_mt_mutex, -#endif - best_ref_index); + bsize, best_ref_index); // macroblock modes *mi = best_mbmode; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c index f74b6b0e9e3..8d9e2e8c37f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c @@ -20,19 +20,14 @@ static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }; -#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method +// Define 3 mesh density levels to control the number of searches. 
+#define MESH_DENSITY_LEVELS 3 static MESH_PATTERN - good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { - { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + good_quality_mesh_patterns[MESH_DENSITY_LEVELS][MAX_MESH_STEP] = { { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } }, { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, - { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, - { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, }; -static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = { - 50, 25, 15, 5, 1, 1 -}; // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality @@ -163,14 +158,29 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed) { const int boosted = frame_is_boosted(cpi); + int i; sf->tx_size_search_breakout = 1; sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; sf->use_square_partition_only = !frame_is_boosted(cpi); sf->use_square_only_threshold = BLOCK_16X16; + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + sf->exhaustive_searches_thresh = (1 << 22); + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 0; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } else { + sf->exhaustive_searches_thresh = INT_MAX; + } + if (speed >= 1) { if (cpi->oxcf.pass == 2) { TWO_PASS *const twopass = &cpi->twopass; @@ -208,6 +218,10 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 30; + + sf->exhaustive_searches_thresh = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23) + : INT_MAX; } if (speed >= 2) { @@ -229,6 +243,16 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->allow_partition_search_skip = 1; sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 1; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } } if (speed >= 3) { @@ -247,6 +271,16 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->adaptive_interp_filter_search = 1; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 2; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } } if (speed >= 4) { @@ -325,7 +359,6 @@ static void set_rt_speed_feature_framesize_independent( sf->adaptive_rd_thresh = 1; sf->adaptive_rd_thresh_row_mt = 0; sf->use_fast_coef_costing = 1; - sf->allow_exhaustive_searches = 0; sf->exhaustive_searches_thresh = INT_MAX; sf->allow_acl = 0; sf->copy_partition_flag = 0; @@ -498,7 +531,15 @@ static void set_rt_speed_feature_framesize_independent( // Enable short circuit for low temporal variance. 
sf->short_circuit_low_temp_var = 1; } - if (cpi->use_svc) sf->base_mv_aggressive = 1; + if (cpi->svc.temporal_layer_id > 0) { + sf->adaptive_rd_thresh = 4; + sf->limit_newmv_early_exit = 0; + sf->mv.subpel_force_stop = (cpi->svc.temporal_layer_id == 1) ? 1 : 2; + sf->base_mv_aggressive = + (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1) + ? 1 + : 0; + } } if (speed >= 7) { @@ -523,9 +564,11 @@ static void set_rt_speed_feature_framesize_independent( if (speed >= 8) { sf->adaptive_rd_thresh = 4; - // Enable partition copy - if (!cpi->use_svc && !cpi->resize_pending && cpi->resize_state == ORIG && - !cpi->external_resize && cpi->oxcf.resize_mode == RESIZE_NONE) { + // Enable partition copy. For SVC, only enabled for the top resolution layer. + if (!cpi->last_frame_dropped && cpi->resize_state == ORIG && + !cpi->external_resize && + (!cpi->use_svc || + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { sf->copy_partition_flag = 1; cpi->max_copied_frame = 4; } @@ -533,7 +576,11 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->row_mt && cpi->oxcf.max_threads > 1) sf->adaptive_rd_thresh_row_mt = 1; - sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2; + if (content == VP9E_CONTENT_SCREEN) + sf->mv.subpel_force_stop = 3; + else if (cm->width * cm->height > 352 * 288) + sf->mv.subpel_force_stop = 2; + if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; // Only keep INTRA_DC mode for speed 8. if (!is_keyframe) { @@ -555,18 +602,13 @@ static void set_rt_speed_feature_framesize_independent( } // Since the short_circuit_low_temp_var is used, reduce the // adaptive_rd_thresh level. - if (cm->width > 320 && cm->height > 240) + if (cm->width * cm->height > 352 * 288) sf->adaptive_rd_thresh = 1; else sf->adaptive_rd_thresh = 2; } sf->limit_newmv_early_exit = 0; - if (cm->width > 320 && cm->height > 240) sf->use_simple_block_yrd = 1; - } - // Turn off adaptive_rd_thresh if row_mt is on for speed 5, 6, 7. - if (speed >= 5 && speed < 8 && cpi->row_mt && cpi->num_workers > 1) { - sf->adaptive_rd_thresh = 0; - sf->adaptive_rd_thresh_row_mt = 0; + sf->use_simple_block_yrd = 1; } } @@ -606,12 +648,11 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { // With row based multi-threading, the following speed features // have to be disabled to guarantee that bitstreams encoded with single thread - // and multiple threads match - if (cpi->oxcf.row_mt_bit_exact) { + // and multiple threads match. + // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since + // adaptive_rd_thresh is defined per-row for non-rd pickmode. + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact) sf->adaptive_rd_thresh = 0; - sf->allow_exhaustive_searches = 0; - sf->adaptive_pred_interp_filter = 0; - } // This is only used in motion vector unit test. if (cpi->oxcf.motion_vector_unit_test == 1) @@ -711,6 +752,16 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->adaptive_rd_thresh = 1; sf->tx_size_search_breakout = 1; + sf->exhaustive_searches_thresh = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ?
(1 << 20) + : INT_MAX; + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; + sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; + } + } + if (oxcf->mode == REALTIME) set_rt_speed_feature_framesize_independent(cpi, sf, oxcf->speed, oxcf->content); @@ -720,34 +771,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { cpi->full_search_sad = vp9_full_search_sad; cpi->diamond_search_sad = vp9_diamond_search_sad; - sf->allow_exhaustive_searches = 1; - if (oxcf->mode == BEST) { - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) - sf->exhaustive_searches_thresh = (1 << 20); - else - sf->exhaustive_searches_thresh = (1 << 21); - sf->max_exaustive_pct = 100; - for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; - sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; - } - } else { - int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) - sf->exhaustive_searches_thresh = (1 << 22); - else - sf->exhaustive_searches_thresh = (1 << 23); - sf->max_exaustive_pct = good_quality_max_mesh_pct[speed]; - if (speed > 0) - sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1; - - for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range; - sf->mesh_patterns[i].interval = - good_quality_mesh_patterns[speed][i].interval; - } - } - // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. if (oxcf->pass == 1) sf->optimize_coefficients = 0; @@ -782,12 +805,11 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { // With row based multi-threading, the following speed features // have to be disabled to guarantee that bitstreams encoded with single thread - // and multiple threads match - if (cpi->oxcf.row_mt_bit_exact) { + // and multiple threads match. + // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since + // adaptive_rd_thresh is defined per-row for non-rd pickmode. + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact) sf->adaptive_rd_thresh = 0; - sf->allow_exhaustive_searches = 0; - sf->adaptive_pred_interp_filter = 0; - } // This is only used in motion vector unit test. if (cpi->oxcf.motion_vector_unit_test == 1) diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h index cbdf8bc3090..ee485a35f4d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h @@ -231,9 +231,11 @@ typedef struct SPEED_FEATURES { // This variable is used to cap the maximum number of times we skip testing a // mode to be evaluated. A high value means we will be faster. + // Turned off when (row_mt_bit_exact == 1 && adaptive_rd_thresh_row_mt == 0). int adaptive_rd_thresh; - // Flag to use adaptive_rd_thresh when row-mt it enabled. + // Flag to use adaptive_rd_thresh when row-mt is enabled, only for non-rd + // pickmode. int adaptive_rd_thresh_row_mt; // Enables skipping the reconstruction step (idct, recon) in the @@ -325,15 +327,9 @@ typedef struct SPEED_FEATURES { // point for this motion search and limits the search range around it.
int adaptive_motion_search; - // Flag for allowing some use of exhaustive searches; - int allow_exhaustive_searches; - // Threshold for allowing exhaustive motion search. int exhaustive_searches_thresh; - // Maximum number of exhaustive searches for a frame. - int max_exaustive_pct; - // Pattern to be used for any exhaustive mesh searches. MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c index 1d892dc148b..5867a6c38b8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -38,10 +38,12 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->current_superframe = 0; for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { - cpi->svc.ext_frame_flags[sl] = 0; - cpi->svc.ext_lst_fb_idx[sl] = 0; - cpi->svc.ext_gld_fb_idx[sl] = 1; - cpi->svc.ext_alt_fb_idx[sl] = 2; + svc->ext_frame_flags[sl] = 0; + svc->ext_lst_fb_idx[sl] = 0; + svc->ext_gld_fb_idx[sl] = 1; + svc->ext_alt_fb_idx[sl] = 2; + svc->downsample_filter_type[sl] = EIGHTTAP; + svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter. } if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { @@ -650,15 +652,25 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); + // For low resolutions, set the phase of the filter to 8 (for a symmetric + // averaging filter); use bilinear for now. + if (width <= 320 && height <= 240) { + cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR; + cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8; + } + // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use - // of base motion vectors if spatial scale factors for any layers are not 2. + // of base motion vectors if spatial scale factors for any layers are not 2, + // keeping the case of 3 spatial layers with a scale factor of 4x4 for the base layer. // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2. if (cpi->svc.number_spatial_layers > 1) { int sl; for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) { lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id]; - if (lc->scaling_factor_num != lc->scaling_factor_den >> 1) { + if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) && + !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 && + cpi->svc.number_spatial_layers == 3)) { cpi->svc.use_base_mv = 0; break; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h index ee7a6638b42..d8e6772b26f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -88,6 +88,13 @@ typedef struct { int force_zero_mode_spatial_ref; int current_superframe; int use_base_mv; + // Used to control the downscaling filter for source scaling, for 1 pass CBR. + // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), + // = 8 will center the target pixel and get a symmetric averaging filter.
+ // downsample_filter_type: 4 filters may be used: eighttap_regular, + // eighttap_smooth, eighttap_sharp, and bilinear. + INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS]; + int downsample_filter_phase[VPX_SS_MAX_LAYERS]; } SVC; struct VP9_COMP; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c index 2b0307f8a11..63079415617 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -8,10 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include <math.h> #include <limits.h> #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" @@ -53,16 +55,19 @@ static void temporal_filter_predictors_mb_c( #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, - 16, 16, which_mv, kernel, MV_PRECISION_Q3, - x, y, xd->bd); + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride, + CONVERT_TO_SHORTPTR(&pred[0]), 16, &mv, + scale, 16, 16, which_mv, kernel, + MV_PRECISION_Q3, x, y, xd->bd); - vp9_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[256]), uv_block_width, &mv, scale, uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, y, xd->bd); - vp9_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[512]), uv_block_width, &mv, scale, uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, y, xd->bd); @@ -93,13 +98,19 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, + int filter_weight, uint32_t *accumulator, uint16_t *count) { unsigned int i, j, k; int modifier; int byte = 0; const int rounding = strength > 0 ? 
1 << (strength - 1) : 0; + assert(strength >= 0); + assert(strength <= 6); + + assert(filter_weight >= 0); + assert(filter_weight <= 2); + for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { int pixel_value = *frame2; @@ -155,7 +166,7 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, void vp9_highbd_temporal_filter_apply_c( const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8, unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, uint16_t *count) { + int filter_weight, uint32_t *accumulator, uint16_t *count) { const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); unsigned int i, j, k; @@ -285,7 +296,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, unsigned int filter_weight; int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; - DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint32_t, accumulator[16 * 16 * 3]); DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); MACROBLOCKD *mbd = &td->mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; @@ -332,8 +343,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, int stride; MV ref_mv; - memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); - memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); + vp9_zero_array(accumulator, 16 * 16 * 3); + vp9_zero_array(count, 16 * 16 * 3); td->mb.mv_limits.col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); td->mb.mv_limits.col_max = @@ -376,45 +387,44 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply_c( + vp9_highbd_temporal_filter_apply( f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, adj_strength, filter_weight, accumulator, count); - vp9_highbd_temporal_filter_apply_c( + vp9_highbd_temporal_filter_apply( f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, mb_uv_width, mb_uv_height, adj_strength, filter_weight, accumulator + 256, count + 256); - vp9_highbd_temporal_filter_apply_c( + vp9_highbd_temporal_filter_apply( f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, mb_uv_width, mb_uv_height, adj_strength, filter_weight, accumulator + 512, count + 512); } else { // Apply the filter (YUV) - vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, - filter_weight, accumulator, count); - vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); - } -#else - // Apply the filter (YUV) - // TODO(jingning): Need SIMD optimization for this. 
- vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, + vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, strength, filter_weight, accumulator, count); - vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, + vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 256, count + 256); - vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, + vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 512, count + 512); + } +#else + // Apply the filter (YUV) + vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, strength, filter_weight, + accumulator, count); + vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, + predictor + 256, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 256, + count + 256); + vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, + predictor + 512, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 512, + count + 512); #endif // CONFIG_VP9_HIGHBITDEPTH } } @@ -745,7 +755,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { "Failed to reallocate alt_ref_buffer"); } frames[frame] = vp9_scale_if_required( - cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0); + cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0, + EIGHTTAP, 0); ++frame_used; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c new file mode 100644 index 00000000000..be4cd8685c5 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +// Division using multiplication and shifting. The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values +// which may be bound by the edges of the block being filtered. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can choose a constant C that replicates the division. +// +// m * 49152 / 65536 = m * 3/4 +// m * 32768 / 65536 = m * 1/2 +// m * 21846 / 65536 = m * 0.3333 +// +// These are loaded using an instruction expecting int16_t values but are used +// with _mm_mulhi_epu16(), which treats them as unsigned. +#define NEIGHBOR_CONSTANT_4 (int16_t)49152 +#define NEIGHBOR_CONSTANT_6 (int16_t)32768 +#define NEIGHBOR_CONSTANT_9 (int16_t)21846 +
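A quick scalar check of the three constants above (an editor's illustrative sketch, not part of this patch): C = 49152 and C = 32768 reproduce m * 3/4 and m * 1/2 exactly, while C = 21846 slightly overshoots 65536/3, so the approximation can exceed m / 3 by at most 1:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint32_t m, max_err = 0;
  for (m = 0; m <= 65535; ++m) {
    const uint32_t exact = m / 3;               /* m * 3 / 9 */
    const uint32_t approx = (m * 21846) >> 16;  /* NEIGHBOR_CONSTANT_9 */
    if (approx - exact > max_err) max_err = approx - exact;  /* approx >= exact */
  }
  printf("max error vs m / 3: %u\n", max_err);  /* prints 1 */
  return 0;
}

+// Load values from 'a' and 'b'.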
Compute the difference squared and sum +// neighboring values such that: +// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2 +// Values to the left and right of the row are set to 0. +// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values. +static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) { + const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b); + + const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8); + const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8); + + const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16); + const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16); + + // Shift all the values one place to the left/right so we can efficiently sum + // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1]. + const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2); + const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2); + + // It becomes necessary to treat the values as unsigned at this point, since + // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point + // forward since the filter is only applied to smooth small pixel changes. + // Once the value has saturated to uint16_t it is well outside the useful + // range. + __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum = sum_u16; +} + +static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0, + __m128i *sum_1) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a); + const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8); + const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero); + const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8); + const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero); + + const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16); + const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16); + const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16); + const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16); + + __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2); + // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8]. + __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2); + + __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum_0 = sum_u16; + + shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14); + shift_right = _mm_srli_si128(diff_sq_1_u16, 2); + + sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum_1 = sum_u16; +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor, shift by strength, clamp to 16, invert +// (16 - value), and multiply by weight. +static __m128i average_8(__m128i sum, const __m128i mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 - 1), which is 49151 / 0xbfff / -16385. + // So this needs to use the epu16 version, which did not come until SSE4.1. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, + const __m128i mul_constants_0, + const __m128i mul_constants_1, const int strength, + const int rounding, const int weight) { + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + __m128i input_0, input_1; + + input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0); + input_0 = _mm_adds_epu16(input_0, rounding_u16); + + input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1); + input_1 = _mm_adds_epu16(input_1, rounding_u16); + + input_0 = _mm_srl_epi16(input_0, strength_u128); + input_1 = _mm_srl_epi16(input_1, strength_u128); + + input_0 = _mm_min_epu16(input_0, sixteen); + input_1 = _mm_min_epu16(input_1, sixteen); + input_0 = _mm_sub_epi16(sixteen, input_0); + input_1 = _mm_sub_epi16(sixteen, input_1); + + *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); + *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); +} +
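The same per-lane arithmetic in scalar form (an editor's illustrative sketch, not part of this patch; the helper name is hypothetical):

static uint16_t average_one_lane(uint16_t sum, uint16_t mul_constant,
                                 int strength, int rounding, int weight) {
  uint32_t m = ((uint32_t)sum * mul_constant) >> 16;  /* _mm_mulhi_epu16 */
  m += (uint32_t)rounding;                            /* _mm_adds_epu16 */
  m >>= strength;                                     /* _mm_srl_epi16 */
  if (m > 16) m = 16;                                 /* _mm_min_epu16 */
  return (uint16_t)((16 - m) * weight);               /* invert, then weight */
}

+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'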
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, + uint16_t *count, uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8); + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), + count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); + __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), + pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); + __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; + __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); + _mm_storeu_si128((__m128i *)count, count_0_u16); + + count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); + _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); + + pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); + pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); + pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); + pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); + accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); + accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); + _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); + _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); +} + +void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride, + const uint8_t *b, unsigned int width, + unsigned int height, int strength, + int weight, uint32_t *accumulator, + uint16_t *count) { + unsigned int h; + const int rounding = strength > 0 ? 
1 << (strength - 1) : 0; + + assert(strength >= 0); + assert(strength <= 6); + + assert(weight >= 0); + assert(weight <= 2); + + assert(width == 8 || width == 16); + + if (width == 8) { + __m128i sum_row_a, sum_row_b, sum_row_c; + __m128i mul_constants = _mm_setr_epi16( + NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + + sum_8(a, b, &sum_row_a); + sum_8(a + stride, b + width, &sum_row_b); + sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_c, b, count, accumulator); + + a += stride + stride; + b += width; + count += width; + accumulator += width; + + mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); + + for (h = 0; h < height - 2; ++h) { + sum_8(a, b + width, &sum_row_c); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c); + sum_row_a = + average_8(sum_row_a, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_a, b, count, accumulator); + + a += stride; + b += width; + count += width; + accumulator += width; + + sum_row_a = sum_row_b; + sum_row_b = sum_row_c; + } + + mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_a, b, count, accumulator); + + } else { // width == 16 + __m128i sum_row_a_0, sum_row_a_1; + __m128i sum_row_b_0, sum_row_b_1; + __m128i sum_row_c_0, sum_row_c_1; + __m128i mul_constants_0 = _mm_setr_epi16( + NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6), + mul_constants_1 = _mm_setr_epi16( + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + + sum_16(a, b, &sum_row_a_0, &sum_row_a_1); + sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1); + + sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); + + average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + + a += stride + stride; + b += width; + count += width; + accumulator += width; + + mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9); + mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); + for (h = 0; h < height - 2; ++h) { + sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1); + + sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0); + sum_row_a_1 = 
_mm_adds_epu16(sum_row_a_1, sum_row_b_1); + sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1); + + average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator); + + a += stride; + b += width; + count += width; + accumulator += width; + + sum_row_a_0 = sum_row_b_0; + sum_row_a_1 = sum_row_b_1; + sum_row_b_0 = sum_row_c_0; + sum_row_b_1 = sum_row_c_1; + } + + mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6); + mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); + + average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c new file mode 100644 index 00000000000..e228bd8b7fa --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" + +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + __m256i sse_256, ssz_256; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_hi, ssz_hi; + __m128i sse_128, ssz_128; + int64_t sse; + const __m256i zero = _mm256_setzero_si256(); + + // If the block size is 16 then the results will fit in 32 bits. + if (block_size == 16) { + __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi; + // Load 16 elements for coeff and dqcoeff. + coeff_256 = load_tran_low(coeff); + dqcoeff_256 = load_tran_low(dqcoeff); + // dqcoeff - coeff + dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256); + // madd (dqcoeff - coeff) + dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256); + // madd coeff + coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256); + // Save the higher 64 bit of each 128 bit lane. + dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8); + coeff_hi = _mm256_srli_si256(coeff_256, 8); + // Add the higher 64 bit to the low 64 bit. + dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi); + coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi); + // Expand each double word in the lower 64 bits to quad word.
+ sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero); + ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero); + } else { + int i; + assert(block_size % 32 == 0); + sse_256 = zero; + ssz_256 = zero; + + for (i = 0; i < block_size; i += 32) { + __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1; + // Load 32 elements for coeff and dqcoeff. + coeff_0 = load_tran_low(coeff + i); + dqcoeff_0 = load_tran_low(dqcoeff + i); + coeff_1 = load_tran_low(coeff + i + 16); + dqcoeff_1 = load_tran_low(dqcoeff + i + 16); + // dqcoeff - coeff + dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0); + dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1); + // madd (dqcoeff - coeff) + dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0); + dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1); + // madd coeff + coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0); + coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1); + // Add the first madd (dqcoeff - coeff) with the second. + dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1); + // Add the first madd (coeff) with the second. + coeff_0 = _mm256_add_epi32(coeff_0, coeff_1); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero); + // Add each quad word of madd (dqcoeff - coeff) and madd (coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi); + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + ssz_hi = _mm256_srli_si256(ssz_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256), + _mm256_extractf128_si256(ssz_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)(&sse), sse_128); + + _mm_storel_epi64((__m128i *)(ssz), ssz_128); + return sse; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c deleted file mode 100644 index e39027f2536..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Usee of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <immintrin.h> // AVX2 - -#include "./vp9_rtcd.h" -#include "vpx/vpx_integer.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" - -int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz) { - __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; - __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; - __m256i sse_reg_64hi, ssz_reg_64hi; - __m128i sse_reg128, ssz_reg128; - int64_t sse; - int i; - const __m256i zero_reg = _mm256_set1_epi16(0); - - // init sse and ssz registerd to zero - sse_reg = _mm256_set1_epi16(0); - ssz_reg = _mm256_set1_epi16(0); - - for (i = 0; i < block_size; i += 16) { - // load 32 bytes from coeff and dqcoeff - coeff_reg = load_tran_low(coeff + i); - dqcoeff_reg = load_tran_low(dqcoeff + i); - // dqcoeff - coeff - dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); - // madd (dqcoeff - coeff) - dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); - // madd coeff - coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); - // expand each double word of madd (dqcoeff - coeff) to quad word - exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); - exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); - // expand each double word of madd (coeff) to quad word - exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); - exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); - // add each quad word of madd (dqcoeff - coeff) and madd (coeff) - sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo); - ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); - sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); - ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); - } - // save the higher 64 bit of each 128 bit lane - sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); - ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8); - // add the higher 64 bit to the low 64 bit - sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); - ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); - - // add each 64 bit from each of the 128 bit lane of the 256 bit - sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), - _mm256_extractf128_si256(sse_reg, 1)); - - ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), - _mm256_extractf128_si256(ssz_reg, 1)); - - // store the results - _mm_storel_epi64((__m128i *)(&sse), sse_reg128); - - _mm_storel_epi64((__m128i *)(ssz), ssz_reg128); - return sse; -} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm index 0a472ec7402..11d473b2dfa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm @@ -39,23 +39,18 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 ; accumulate in 64bit punpckldq m7, m0, m5 punpckhdq m0, m5 paddq m4, m7 - punpckldq m7, m1, m5 - paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m7 punpckldq m7, m2, m5 - paddq m4, m1 + paddq m4, m0 punpckhdq m2, m5 paddq m6, m7 - punpckldq m7, m3, m5 paddq m6, m2 - punpckhdq m3, m5 - paddq m6, m7 - paddq m6, m3 jg .loop ; accumulate horizontally and store in return value @@ -98,15 +93,13 @@ cglobal block_error_fp, 3, 3, 6, uqc, dqc, size ; thus the sum of 2 should fit in a 31bit integer (+ 
unused sign bit) pmaddwd m0, m0 pmaddwd m1, m1 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 ; accumulate in 64bit punpckldq m3, m0, m5 punpckhdq m0, m5 paddq m4, m3 - punpckldq m3, m1, m5 paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m3 - paddq m4, m1 jnz .loop ; accumulate horizontally and store in return value diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c index fa2a6449b02..b53714a0289 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -16,7 +16,8 @@ #include "vpx_scale/yv12config.h" extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst); + YV12_BUFFER_CONFIG *dst, + uint8_t filter_type, int phase_scaler); static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, @@ -168,7 +169,8 @@ static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride, } void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { + YV12_BUFFER_CONFIG *dst, + uint8_t filter_type, int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; @@ -176,7 +178,7 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, const int dst_uv_w = dst_w / 2; const int dst_uv_h = dst_h / 2; - if (dst_w * 2 == src_w && dst_h * 2 == src_h) { + if (dst_w * 2 == src_w && dst_h * 2 == src_h && phase_scaler == 0) { downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, dst_h); downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, @@ -184,7 +186,7 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, dst_uv_w, dst_uv_h); vpx_extend_frame_borders(dst); - } else if (dst_w == src_w * 2 && dst_h == src_h * 2) { + } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { // The upsample() supports widths up to 1920 * 2. If greater, fall back // to vp9_scale_and_extend_frame_c(). if (dst_w / 2 <= 1920) { @@ -196,9 +198,9 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, dst->uv_stride, dst_uv_w, dst_uv_h); vpx_extend_frame_borders(dst); } else { - vp9_scale_and_extend_frame_c(src, dst); + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); } } else { - vp9_scale_and_extend_frame_c(src, dst); + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm deleted file mode 100644 index 21aaa938318..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm +++ /dev/null @@ -1,212 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. 
All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -; void vp9_temporal_filter_apply_sse2 | arg -; (unsigned char *frame1, | 0 -; unsigned int stride, | 1 -; unsigned char *frame2, | 2 -; unsigned int block_width, | 3 -; unsigned int block_height, | 4 -; int strength, | 5 -; int filter_weight, | 6 -; unsigned int *accumulator, | 7 -; unsigned short *count) | 8 -global sym(vp9_temporal_filter_apply_sse2) PRIVATE -sym(vp9_temporal_filter_apply_sse2): - - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ALIGN_STACK 16, rax - %define block_width 0 - %define block_height 16 - %define strength 32 - %define filter_weight 48 - %define rounding_bit 64 - %define rbp_backup 80 - %define stack_size 96 - sub rsp, stack_size - mov [rsp + rbp_backup], rbp - ; end prolog - - mov edx, arg(3) - mov [rsp + block_width], rdx - mov edx, arg(4) - mov [rsp + block_height], rdx - movd xmm6, arg(5) - movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read - - ; calculate the rounding bit outside the loop - ; 0x8000 >> (16 - strength) - mov rdx, 16 - sub rdx, arg(5) ; 16 - strength - movq xmm4, rdx ; can't use rdx w/ shift - movdqa xmm5, [GLOBAL(_const_top_bit)] - psrlw xmm5, xmm4 - movdqa [rsp + rounding_bit], xmm5 - - mov rsi, arg(0) ; src/frame1 - mov rdx, arg(2) ; predictor frame - mov rdi, arg(7) ; accumulator - mov rax, arg(8) ; count - - ; dup the filter weight and store for later - movd xmm0, arg(6) ; filter_weight - pshuflw xmm0, xmm0, 0 - punpcklwd xmm0, xmm0 - movdqa [rsp + filter_weight], xmm0 - - mov rbp, arg(1) ; stride - pxor xmm7, xmm7 ; zero for extraction - - mov rcx, [rsp + block_width] - imul rcx, [rsp + block_height] - add rcx, rdx - cmp dword ptr [rsp + block_width], 8 - jne .temporal_filter_apply_load_16 - -.temporal_filter_apply_load_8: - movq xmm0, [rsi] ; first row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm0, xmm7 ; src[ 0- 7] - movq xmm1, [rsi] ; second row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm1, xmm7 ; src[ 8-15] - jmp .temporal_filter_apply_load_finished - -.temporal_filter_apply_load_16: - movdqa xmm0, [rsi] ; src (frame1) - lea rsi, [rsi + rbp] ; += stride - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; src[ 0- 7] - punpckhbw xmm1, xmm7 ; src[ 8-15] - -.temporal_filter_apply_load_finished: - movdqa xmm2, [rdx] ; predictor (frame2) - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm7 ; pred[ 0- 7] - punpckhbw xmm3, xmm7 ; pred[ 8-15] - - ; modifier = src_byte - pixel_value - psubw xmm0, xmm2 ; src - pred[ 0- 7] - psubw xmm1, xmm3 ; src - pred[ 8-15] - - ; modifier *= modifier - pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 - pmullw xmm1, xmm1 ; modifer[ 8-15]^2 - - ; modifier *= 3 - pmullw xmm0, [GLOBAL(_const_3w)] - pmullw xmm1, [GLOBAL(_const_3w)] - - ; modifer += 0x8000 >> (16 - strength) - paddw xmm0, [rsp + rounding_bit] - paddw xmm1, [rsp + rounding_bit] - - ; modifier >>= strength - psrlw xmm0, [rsp + strength] - psrlw xmm1, [rsp + strength] - - ; modifier = 16 - modifier - ; saturation takes care of modifier > 16 - movdqa xmm3, [GLOBAL(_const_16w)] - movdqa xmm2, [GLOBAL(_const_16w)] - psubusw xmm3, xmm1 - psubusw xmm2, xmm0 - - ; modifier *= filter_weight - pmullw xmm2, [rsp + filter_weight] - pmullw xmm3, [rsp + filter_weight] - - ; count - movdqa xmm4, [rax] - movdqa xmm5, [rax+16] - ; += modifier - paddw xmm4, xmm2 - paddw xmm5, xmm3 - ; write back - movdqa [rax], xmm4 - movdqa [rax+16], xmm5 - lea rax, 
[rax + 16*2] ; count += 16*(sizeof(short)) - - ; load and extract the predictor up to shorts - pxor xmm7, xmm7 - movdqa xmm0, [rdx] - lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; pred[ 0- 7] - punpckhbw xmm1, xmm7 ; pred[ 8-15] - - ; modifier *= pixel_value - pmullw xmm0, xmm2 - pmullw xmm1, xmm3 - - ; expand to double words - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm7 ; [ 0- 3] - punpckhwd xmm2, xmm7 ; [ 4- 7] - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm7 ; [ 8-11] - punpckhwd xmm3, xmm7 ; [12-15] - - ; accumulator - movdqa xmm4, [rdi] - movdqa xmm5, [rdi+16] - movdqa xmm6, [rdi+32] - movdqa xmm7, [rdi+48] - ; += modifier - paddd xmm4, xmm0 - paddd xmm5, xmm2 - paddd xmm6, xmm1 - paddd xmm7, xmm3 - ; write back - movdqa [rdi], xmm4 - movdqa [rdi+16], xmm5 - movdqa [rdi+32], xmm6 - movdqa [rdi+48], xmm7 - lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) - - cmp rdx, rcx - je .temporal_filter_apply_epilog - pxor xmm7, xmm7 ; zero for extraction - cmp dword ptr [rsp + block_width], 16 - je .temporal_filter_apply_load_16 - jmp .temporal_filter_apply_load_8 - -.temporal_filter_apply_epilog: - ; begin epilog - mov rbp, [rsp + rbp_backup] - add rsp, stack_size - pop rsp - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -_const_3w: - times 8 dw 3 -align 16 -_const_top_bit: - times 8 dw 1<<15 -align 16 -_const_16w - times 8 dw 16 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c index a335a4ab55d..25fc80a9a1e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c @@ -52,7 +52,6 @@ struct vp9_extracfg { int render_width; int render_height; unsigned int row_mt; - unsigned int row_mt_bit_exact; unsigned int motion_vector_unit_test; }; @@ -86,7 +85,6 @@ static struct vp9_extracfg default_extra_cfg = { 0, // render width 0, // render height 0, // row_mt - 0, // row_mt_bit_exact 0, // motion_vector_unit_test }; @@ -252,7 +250,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, "or kf_max_dist instead."); RANGE_CHECK(extra_cfg, row_mt, 0, 1); - RANGE_CHECK(extra_cfg, row_mt_bit_exact, 0, 1); RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2); RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); RANGE_CHECK(extra_cfg, cpu_used, -8, 8); @@ -564,7 +561,6 @@ static vpx_codec_err_t set_encoder_config( oxcf->target_level = extra_cfg->target_level; oxcf->row_mt = extra_cfg->row_mt; - oxcf->row_mt_bit_exact = extra_cfg->row_mt_bit_exact; oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { @@ -862,13 +858,6 @@ static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } -static vpx_codec_err_t ctrl_enable_row_mt_bit_exact(vpx_codec_alg_priv_t *ctx, - va_list args) { - struct vp9_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.row_mt_bit_exact = CAST(VP9E_ENABLE_ROW_MT_BIT_EXACT, args); - return update_extra_cfg(ctx, &extra_cfg); -} - static vpx_codec_err_t ctrl_enable_motion_vector_unit_test( vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; @@ -1633,7 +1622,6 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_RENDER_SIZE, ctrl_set_render_size }, { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level }, { VP9E_SET_ROW_MT, ctrl_set_row_mt }, - { 
VP9E_ENABLE_ROW_MT_BIT_EXACT, ctrl_enable_row_mt_bit_exact }, { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, // Getters diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk b/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk index e0913bea3e6..47846c9410d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk @@ -100,7 +100,8 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c + VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) @@ -120,9 +121,10 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_denoiser_neon.c endif -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c @@ -135,6 +137,5 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c index c2f80d88515..c774abb34f2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c @@ -436,6 +436,10 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2]; si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2]; } + if (svc_ctx->spatial_layers == 1) { + si->svc_params.scaling_factor_num[0] = 1; + si->svc_params.scaling_factor_den[0] = 1; + } } for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h index b8ed0bb2e6a..ee6be4a249c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h @@ -555,15 +555,6 @@ enum vp8e_enc_control_id { */ VP9E_SET_ROW_MT, - /*!\brief Codec control function to enable bit-exact bitstream when row level - * multi-threading is enabled. - * - * 0 : off, 1 : on - * - * Supported in codecs: VP9 - */ - VP9E_ENABLE_ROW_MT_BIT_EXACT, - /*!\brief Codec control function to get bitstream level. 
* * Supported in codecs: VP9 @@ -867,9 +858,6 @@ VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int) #define VPX_CTRL_VP9E_SET_ROW_MT -VPX_CTRL_USE_TYPE(VP9E_ENABLE_ROW_MT_BIT_EXACT, unsigned int) -#define VPX_CTRL_VP9E_ENABLE_ROW_MT_BIT_EXACT - VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) #define VPX_CTRL_VP9E_GET_LEVEL diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c index cca9a932423..257e8ffee57 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c @@ -16,6 +16,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { const uint32x4_t a = vpaddlq_u16(v_16x8); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c new file mode 100644 index 00000000000..fe78f3f5138 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + int i; + // input[M * stride] * 16 + int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + + // If the very first value != 0, then add 1. + if (input[0] != 0) { + const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); + input_0 = vadd_s16(input_0, one); + } + + for (i = 0; i < 2; ++i) { + const int16x8_t input_01 = vcombine_s16(input_0, input_1); + const int16x8_t input_32 = vcombine_s16(input_3, input_2); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // (s_0 +/- s_1) * cospi_16_64 + // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. 
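// [Editor's note] A rough sketch of why 16 bits are not enough at this
// step, assuming 8-bit residuals on the first pass: each input was scaled
// by 1 << 4 above, so |s_0 + s_1| can reach about 4 * 255 * 16 = 16320.
// Multiplying by cospi_16_64 (11585) gives roughly 1.9e8, which overflows
// int16 but fits easily in int32; the sums are therefore widened with
// vaddl_s16/vsubl_s16 before the multiply, and vrshrn_n_s32(...,
// DCT_CONST_BITS) below narrows back to int16 with rounding, i.e.
// (x + (1 << 13)) >> 14.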
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); + const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); + const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int16_t)cospi_16_64); + const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int16_t)cospi_16_64); + + // fdct_round_shift + int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); + int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int16_t)cospi_8_64); + const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int16_t)cospi_24_64); + + const int32x4_t temp3 = + vmlal_n_s16(s_3_cospi_8_64, s_2, (int16_t)cospi_24_64); + const int32x4_t temp4 = + vmlsl_n_s16(s_3_cospi_24_64, s_2, (int16_t)cospi_8_64); + + // fdct_round_shift + int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); + int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); + + transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); + + input_0 = out_0; + input_1 = out_1; + input_2 = out_2; + input_3 = out_3; + } + + { + // Not quite a rounding shift. Only add 1 despite shifting by 2. + const int16x8_t one = vdupq_n_s16(1); + int16x8_t out_01 = vcombine_s16(input_0, input_1); + int16x8_t out_23 = vcombine_s16(input_2, input_3); + out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); + out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); + store_s16q_to_tran_low(final_output + 0 * 8, out_01); + store_s16q_to_tran_low(final_output + 1 * 8, out_23); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c index 96f6de1be95..c449b466016 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c @@ -14,6 +14,7 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int stride) { @@ -125,6 +126,8 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 } // transpose 8x8 + // Can't use transpose_s16_8x8() because the values are arranged in two 4x8 + // columns. 
{ // 00 01 02 03 40 41 42 43 // 10 11 12 13 50 51 52 53 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c index ebeafed31fd..79bedd848a3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c index 1259bb3807b..98e42cd25ab 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -1268,10 +1268,8 @@ void vpx_highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, } } -void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - if (bd == 8) { int16_t row_idct_output[16 * 16]; @@ -1313,10 +1311,8 @@ void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - if (bd == 8) { int16_t row_idct_output[16 * 16]; @@ -1349,10 +1345,8 @@ void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - if (bd == 8) { int16_t row_idct_output[4 * 16]; @@ -1414,7 +1408,7 @@ static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const tran_low_t out0 = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); @@ -1422,7 +1416,6 @@ void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int i; if (a1 >= 0) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c index 858342830d8..96a55c472f6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -386,15 +386,14 @@ static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out, } static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input, - uint8_t *const dest, - const int stride, const int bd) { + uint16_t 
*dst, const int stride, + const int bd) { int i, idct32_pass_loop; int32_t trans_buf[32 * 8]; int32_t pass1[32 * 32]; int32_t pass2[32 * 32]; int32_t *out; int32x4x2_t q[16]; - uint16_t *dst = CONVERT_TO_SHORTPTR(dest); for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; idct32_pass_loop++, input = pass1, out = pass2) { @@ -637,10 +636,10 @@ static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input, } } -void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, +void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { if (bd == 8) { - vpx_idct32_32_neon(input, dest, stride, 1); + vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1); } else { vpx_highbd_idct32_32_neon(input, dest, stride, bd); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c index 52f3d43e5c4..3970a5a8613 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c @@ -726,10 +726,9 @@ static void vpx_highbd_idct32_16_neon(const int32_t *const input, highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); } -void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); if (bd == 8) { int16_t temp[32 * 16]; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c index 195dcc92d5e..5d9063b15dc 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c @@ -594,10 +594,9 @@ static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output, highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); } -void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); if (bd == 8) { int16_t temp[32 * 8]; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c index d74331f8031..63eb49678cc 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c @@ -59,7 +59,7 @@ static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const tran_low_t out0 = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); @@ -67,7 +67,6 @@ void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = 
CONVERT_TO_SHORTPTR(dest8); int i; if (a1 >= 0) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c index 128f72b9c96..20b09f68343 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -51,7 +51,7 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); const tran_low_t out0 = @@ -60,7 +60,6 @@ void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); @@ -133,14 +132,13 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, *a3 = vsubq_s32(b0, b3); } -void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); int32x4_t c0 = vld1q_s32(input); int32x4_t c1 = vld1q_s32(input + 4); int32x4_t c2 = vld1q_s32(input + 8); int32x4_t c3 = vld1q_s32(input + 12); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int16x8_t a0, a1; if (bd == 8) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c index f53f4c7fcad..6687e764959 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -36,7 +36,7 @@ static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const tran_low_t out0 = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); @@ -44,7 +44,6 @@ void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); if (a1 >= 0) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); @@ -292,9 +291,8 @@ static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, vst1q_u16(dest, d7_u16); } -void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int32x4_t a0 = vld1q_s32(input); int32x4_t a1 = vld1q_s32(input + 8); int32x4_t a2 = vld1q_s32(input + 16); @@ -553,9 +551,8 @@ static INLINE void idct8x8_64_half1d_bd12( *io7 = vsubq_s32(step1[0], step2[7]); } -void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t 
*input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int32x4_t a0 = vld1q_s32(input); int32x4_t a1 = vld1q_s32(input + 4); int32x4_t a2 = vld1q_s32(input + 8); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index 1fde13e8d6d..74345e1facf 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -135,18 +135,16 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, return d; } -void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, // unused int y_step_q4, // unused int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x, + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_x); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -336,20 +334,17 @@ void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, } } -void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8, - ptrdiff_t src_stride, uint8_t *dst8, +void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, // unused int y_step_q4, // unused int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_avg_horiz_c(src8, src_stride, dst8, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, bd); + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_x); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -569,18 +564,16 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8, } } -void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, // unused int x_step_q4, // unused const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x, + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_y); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); @@ -736,20 +729,17 @@ void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, } } -void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8, - ptrdiff_t src_stride, uint8_t *dst8, 
+void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, // unused int x_step_q4, // unused const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_avg_vert_c(src8, src_stride, dst8, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h, bd); + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_y); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c index f4d70761eb3..4ff3dea085e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -13,14 +13,11 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; (void)filter_x_stride; (void)filter_y; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index a980ab1a380..61712d48e3c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -13,14 +13,11 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; (void)filter_x_stride; (void)filter_y; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c index 4e6e109920a..f769620a43b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c @@ -13,12 +13,11 @@ #include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" -void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const 
int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); @@ -29,23 +28,20 @@ void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride, * height and filter a multiple of 4 lines. Since this goes in to the temp * buffer which has lots of extra room and is subsequently discarded this is * safe if somewhat less than ideal. */ - vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3), - src_stride, CONVERT_TO_BYTEPTR(temp), w, + vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, bd); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst, - dst_stride, filter_x, x_step_q4, filter_y, - y_step_q4, w, h, bd); + vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); @@ -55,11 +51,9 @@ void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. 
*/ - vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3), - src_stride, CONVERT_TO_BYTEPTR(temp), w, + vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, bd); - vpx_highbd_convolve8_avg_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst, - dst_stride, filter_x, x_step_q4, filter_y, - y_step_q4, w, h, bd); + vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c index 828fb5f6c71..5c5963d277e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c index b398259918a..021211bc990 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c index fc0c4cd8462..f3c336fa31f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c index 34b5baf7236..9f4589ea968 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" @@ -517,7 +518,7 @@ void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1 int16_t *out; int16x8_t q[16]; - uint16_t *dst = CONVERT_TO_SHORTPTR(dest); + uint16_t *dst = CAST_TO_SHORTPTR(dest); for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; idct32_pass_loop++, out = pass2) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c index d1eae24a222..21d21b03368 100644 --- 
a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -12,6 +12,7 @@ #include <assert.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/inv_txfm.h" static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c index bff98cbc169..673a36840e3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c @@ -13,13 +13,14 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const uint8_t *dst = dest; const int16x4_t cospis = vld1_s16(kCospi); - uint32x2_t dest01_u32 = vdup_n_u32(0); + uint8x8_t dest01_u8; uint32x2_t dest32_u32 = vdup_n_u32(0); int16x8_t a0, a1; uint8x8_t d01, d32; @@ -39,25 +40,22 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, a0 = vrshrq_n_s16(a0, 4); a1 = vrshrq_n_s16(a1, 4); - dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0); - dst += stride; - dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1); - dst += stride; + dest01_u8 = load_u8(dst, stride); + dst += 2 * stride; + // The elements are loaded in reverse order. dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1); dst += stride; dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0); - d01_u16 = - vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32)); + d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8); d32_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32)); d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16)); d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16)); - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0); - dest += stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1); - dest += stride; + store_u8(dest, stride, d01); + dest += 2 * stride; + // The elements are stored in reverse order. 
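// [Editor's note] "Reverse order" here means the second result vector holds
// row 3 in its low 32-bit lane and row 2 in its high lane, mirroring the
// reversed load above, so the two vst1_lane_u32() stores below write lane 1
// to row 2 and lane 0 to row 3, landing each row back in place.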
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1); dest += stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c index 279da67d74f..1121ade2796 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h index 27c784edca9..0fc1de8e491 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h @@ -41,58 +41,6 @@ DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = { }; //------------------------------------------------------------------------------ -// Helper functions used to load tran_low_t into int16, narrowing if necessary. - -static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { -#if CONFIG_VP9_HIGHBITDEPTH - const int32x4x2_t v0 = vld2q_s32(buf); - const int32x4x2_t v1 = vld2q_s32(buf + 8); - const int16x4_t s0 = vmovn_s32(v0.val[0]); - const int16x4_t s1 = vmovn_s32(v0.val[1]); - const int16x4_t s2 = vmovn_s32(v1.val[0]); - const int16x4_t s3 = vmovn_s32(v1.val[1]); - int16x8x2_t res; - res.val[0] = vcombine_s16(s0, s2); - res.val[1] = vcombine_s16(s1, s3); - return res; -#else - return vld2q_s16(buf); -#endif -} - -static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { -#if CONFIG_VP9_HIGHBITDEPTH - const int32x4_t v0 = vld1q_s32(buf); - const int32x4_t v1 = vld1q_s32(buf + 4); - const int16x4_t s0 = vmovn_s32(v0); - const int16x4_t s1 = vmovn_s32(v1); - return vcombine_s16(s0, s1); -#else - return vld1q_s16(buf); -#endif -} - -static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { -#if CONFIG_VP9_HIGHBITDEPTH - const int32x4_t v0 = vld1q_s32(buf); - return vmovn_s32(v0); -#else - return vld1_s16(buf); -#endif -} - -static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { -#if CONFIG_VP9_HIGHBITDEPTH - const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); - const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); - vst1q_s32(buf, v0); - vst1q_s32(buf + 4, v1); -#else - vst1q_s16(buf, a); -#endif -} - -//------------------------------------------------------------------------------ // Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) { #if CONFIG_VP9_HIGHBITDEPTH diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h new file mode 100644 index 00000000000..ba5c3d513d4 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_ARM_MEM_NEON_H_ +#define VPX_DSP_ARM_MEM_NEON_H_ + +#include <arm_neon.h> +#include <assert.h> +#include <string.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Helper functions used to load tran_low_t into int16, narrowing if necessary. +static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4x2_t v0 = vld2q_s32(buf); + const int32x4x2_t v1 = vld2q_s32(buf + 8); + const int16x4_t s0 = vmovn_s32(v0.val[0]); + const int16x4_t s1 = vmovn_s32(v0.val[1]); + const int16x4_t s2 = vmovn_s32(v1.val[0]); + const int16x4_t s3 = vmovn_s32(v1.val[1]); + int16x8x2_t res; + res.val[0] = vcombine_s16(s0, s2); + res.val[1] = vcombine_s16(s1, s3); + return res; +#else + return vld2q_s16(buf); +#endif +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +#else + return vld1q_s16(buf); +#endif +} + +static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + return vmovn_s32(v0); +#else + return vld1_s16(buf); +#endif +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +#else + vst1q_s16(buf, a); +#endif +} + +// Load 2 sets of 4 bytes when alignment is guaranteed. +static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) { + uint32x2_t a = vdup_n_u32(0); + + assert(!((intptr_t)buf % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + a = vld1_lane_u32((const uint32_t *)buf, a, 0); + buf += stride; + a = vld1_lane_u32((const uint32_t *)buf, a, 1); + return vreinterpret_u8_u32(a); +} + +// Store 2 sets of 4 bytes when alignment is guaranteed. +static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) { + uint32x2_t a_u32 = vreinterpret_u32_u8(a); + + assert(!((intptr_t)buf % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + vst1_lane_u32((uint32_t *)buf, a_u32, 0); + buf += stride; + vst1_lane_u32((uint32_t *)buf, a_u32, 1); +} +#endif // VPX_DSP_ARM_MEM_NEON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c index f044e11a155..9b1622ff038 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -22,12 +22,12 @@ static const uint8_t bilinear_filters[8][2] = { { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; +// Process a block exactly 8 wide and any height. 
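// [Editor's note] Per output pixel, the two filter routines below compute
// libvpx's usual first-order bilinear interpolation, in effect
//   out[x] = (src[x] * filter[0] + src[x + pixel_step] * filter[1] + 64) >> 7
// where every pair in bilinear_filters sums to 128, so this is a rounded
// weighted average selected by the sub-pixel offset.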
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { const uint8x8_t f0 = vmov_n_u8(filter[0]); const uint8x8_t f1 = vmov_n_u8(filter[1]); @@ -41,10 +41,11 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, vst1_u8(&output_ptr[0], out); // Next row... src_ptr += src_pixels_per_line; - output_ptr += output_width; + output_ptr += 8; } } +// Process a block which is a mutiple of 16 wide and any height. static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, @@ -73,61 +74,36 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, } } -unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); - DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); - - var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, - bilinear_filters[yoffset]); - return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); -} - -unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); - DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, - bilinear_filters[yoffset]); - return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); -} - -unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); - DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, - bilinear_filters[yoffset]); - return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); -} - -unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); +// TODO(johannkoenig): support 4xM block sizes. 
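// [Editor's note] A scalar model of what sub_pixel_varianceNxM() below
// expands to; the function name and variable names are illustrative, not
// part of the patch. Pass one filters (m + 1) rows horizontally so that
// pass two can filter m rows vertically with a row stride of n; the result
// then feeds the matching vpx_variance function.
static void subpel_model_c(const uint8_t *src, int src_stride, int n, int m,
                           int xoffset, int yoffset, uint8_t *temp2) {
  uint8_t fdata3[64 * 65];  // big enough for the largest n x (m + 1) here
  int r, c;
  // Horizontal pass over (m + 1) rows.
  for (r = 0; r < m + 1; ++r)
    for (c = 0; c < n; ++c)
      fdata3[r * n + c] =
          (src[r * src_stride + c] * bilinear_filters[xoffset][0] +
           src[r * src_stride + c + 1] * bilinear_filters[xoffset][1] + 64) >>
          7;
  // Vertical pass over m rows, row stride n.
  for (r = 0; r < m; ++r)
    for (c = 0; c < n; ++c)
      temp2[r * n + c] =
          (fdata3[r * n + c] * bilinear_filters[yoffset][0] +
           fdata3[(r + 1) * n + c] * bilinear_filters[yoffset][1] + 64) >>
          7;
}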
+#define sub_pixel_varianceNxM(n, m) \ + unsigned int vpx_sub_pixel_variance##n##x##m##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse) { \ + DECLARE_ALIGNED(16, uint8_t, fdata3[n * (m + 1)]); \ + DECLARE_ALIGNED(16, uint8_t, temp2[n * m]); \ + \ + if (n == 8) { \ + var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, (m + 1), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w8(fdata3, temp2, n, n, m, \ + bilinear_filters[yoffset]); \ + } else { \ + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, (m + 1), n, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w16(fdata3, temp2, n, n, m, n, \ + bilinear_filters[yoffset]); \ + } \ + return vpx_variance##n##x##m(temp2, n, dst, dst_stride, sse); \ + } - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, - bilinear_filters[yoffset]); - return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); -} +sub_pixel_varianceNxM(8, 4); +sub_pixel_varianceNxM(8, 8); +sub_pixel_varianceNxM(8, 16); +sub_pixel_varianceNxM(16, 8); +sub_pixel_varianceNxM(16, 16); +sub_pixel_varianceNxM(16, 32); +sub_pixel_varianceNxM(32, 16); +sub_pixel_varianceNxM(32, 32); +sub_pixel_varianceNxM(32, 64); +sub_pixel_varianceNxM(64, 32); +sub_pixel_varianceNxM(64, 64); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c index b6d7f86a4b2..c0828e8f639 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c @@ -31,77 +31,129 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { return vget_lane_s32(c, 0); } -// w * h must be less than 2048 or local variable v_sum may overflow. -static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, - int *sum) { +// w * h must be less than 2048 or sum_s16 may overflow. +// Process a block of any size where the width is divisible by 16. 
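// [Editor's note] The bound follows from the accumulator width: every 16
// pixels add two int16 values to each lane of sum_s16, so a lane
// accumulates (w * h) / 8 differences, each in [-255, 255]. The largest
// block handled in a single call, 32x32 = 1024 pixels, gives a worst case
// of 128 * 255 = 32640, just inside int16; at w * h = 2048 it would be
// 256 * 255 = 65280 and could wrap.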
+static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { int i, j; - int16x8_t v_sum = vdupq_n_s16(0); - int32x4_t v_sse_lo = vdupq_n_s32(0); - int32x4_t v_sse_hi = vdupq_n_s32(0); + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_lo_s32 = vdupq_n_s32(0); + int32x4_t sse_hi_s32 = vdupq_n_s32(0); for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const uint8x8_t v_a = vld1_u8(&a[j]); - const uint8x8_t v_b = vld1_u8(&b[j]); - const uint16x8_t v_diff = vsubl_u8(v_a, v_b); - const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); - v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = - vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); - v_sse_hi = - vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); + for (j = 0; j < w; j += 16) { + const uint8x16_t a_u8 = vld1q_u8(a + j); + const uint8x16_t b_u8 = vld1q_u8(b + j); + + const uint16x8_t diff_lo_u16 = + vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); + const uint16x8_t diff_hi_u16 = + vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); + + const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); + const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); + + sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); + sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), + vget_low_s16(diff_lo_s16)); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), + vget_high_s16(diff_lo_s16)); + + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), + vget_low_s16(diff_hi_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), + vget_high_s16(diff_hi_s16)); } a += a_stride; b += b_stride; } - *sum = horizontal_add_s16x8(v_sum); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); + *sum = horizontal_add_s16x8(sum_s16); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); +} + +// w * h must be less than 2048 or sum_s16 may overflow. +// Process a block of width 8 two rows at a time. 
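// [Editor's note] The varianceNxM macro further below relies on the
// identity
//   variance = sse - (sum * sum) / (n * m)
// with the division done as >> shift, shift = log2(n * m). Its 64-bit
// branch kicks in at n * m == 16 * 16 because |sum| can reach
// 255 * 256 = 65280 there, and 65280 * 65280 is about 4.26e9, which no
// longer fits in a signed 32-bit intermediate; for the smaller blocks the
// square stays below 2^31.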
+static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int h, uint32_t *sse, int *sum) { + int i = 0; + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_lo_s32 = vdupq_n_s32(0); + int32x4_t sse_hi_s32 = vdupq_n_s32(0); + + do { + const uint8x8_t a_0_u8 = vld1_u8(a); + const uint8x8_t a_1_u8 = vld1_u8(a + a_stride); + const uint8x8_t b_0_u8 = vld1_u8(b); + const uint8x8_t b_1_u8 = vld1_u8(b + b_stride); + const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); + const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); + const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); + const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16); + sum_s16 = vaddq_s16(sum_s16, diff_0_s16); + sum_s16 = vaddq_s16(sum_s16, diff_1_s16); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16), + vget_low_s16(diff_0_s16)); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16), + vget_low_s16(diff_1_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16), + vget_high_s16(diff_0_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), + vget_high_s16(diff_1_s16)); + a += a_stride + a_stride; + b += b_stride + b_stride; + i += 2; + } while (i < h); + + *sum = horizontal_add_s16x8(sum_s16); + *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); } void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); + variance_neon_w8x2(a, a_stride, b, b_stride, 8, sse, sum); } void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); -} - -unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - ((sum * sum) >> 6); + variance_neon_w16(a, a_stride, b, b_stride, 16, 16, sse, sum); } -unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); -} +#define varianceNxM(n, m, shift) \ + unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + if (n == 8) \ + variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum); \ + else \ + variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum); \ + if (n * m < 16 * 16) \ + return *sse - ((sum * sum) >> shift); \ + else \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } -unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); -} +varianceNxM(8, 4, 5); +varianceNxM(8, 8, 6); +varianceNxM(8, 16, 7); +varianceNxM(16, 8, 7); +varianceNxM(16, 16, 8); +varianceNxM(16, 32, 9); +varianceNxM(32, 16, 9); +varianceNxM(32, 32, 10); unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + 
(32 * a_stride), a_stride, b + (32 * b_stride), b_stride, - 32, 32, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w16(a + (32 * a_stride), a_stride, b + (32 * b_stride), + b_stride, 32, 32, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); @@ -112,9 +164,9 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), + b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); @@ -126,162 +178,24 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), + b_stride, 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), + b_stride, 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), + b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } -unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, 
vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c index 210a9bed962..29323d1b899 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c +++ 
b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c @@ -1182,16 +1182,10 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, // Rows for (i = 0; i < 32; ++i) { - int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) + int16_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; + + if (zero_coeff) idct32_c(input, outptr); else memset(outptr, 0, sizeof(tran_low_t) * 32); @@ -1290,7 +1284,7 @@ static INLINE int detect_invalid_highbd_input(const tran_low_t *input, return 0; } -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ @@ -1299,7 +1293,6 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, tran_high_t a1, b1, c1, d1, e1; const tran_low_t *ip = input; tran_low_t *op = output; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); for (i = 0; i < 4; i++) { a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1348,14 +1341,13 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; const tran_low_t *ip = in; tran_low_t *op = tmp; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); (void)bd; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1452,13 +1444,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); } -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 4; ++i) { @@ -1478,13 +1469,12 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1; tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); @@ -1636,13 +1626,12 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); } -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows for (i = 0; i < 8; ++i) { @@ -1662,13 +1651,12 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, } } -void 
vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows // Only first 4 row has non-zero coefs @@ -1689,13 +1677,12 @@ void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); @@ -2056,13 +2043,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); } -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows for (i = 0; i < 16; ++i) { @@ -2082,13 +2068,12 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows. Since all non-zero dct coefficients are in // upper-left 8x8 area, we only need to calculate first 8 rows here. @@ -2111,13 +2096,12 @@ void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. 
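A note on the pattern repeated through these inv_txfm.c hunks: the high-bitdepth reconstruction functions used to take the destination as uint8_t *dest8 and immediately recover the real 10/12-bit buffer via CONVERT_TO_SHORTPTR; they now take uint16_t *dest directly, so the decode happens once in the caller instead of inside every kernel. A minimal sketch of the convention being retired — the shift-based macro bodies below are my reading of vpx_dsp_common.h and are illustrative, not authoritative:

#include <stdint.h>

/* Assumed legacy convention: a uint16_t buffer is smuggled through a
 * uint8_t * by scaling the address, so the 8-bit and high-bitdepth
 * paths can share one function-pointer signature. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

/* After this patch a caller holding an encoded dest8 decodes it once,
 * e.g. vpx_highbd_idct16x16_10_add_c(input, CONVERT_TO_SHORTPTR(dest8),
 * stride, bd), rather than each kernel invoking the macro itself. */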
@@ -2138,13 +2122,12 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -2531,26 +2514,19 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); } -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 32; ++i) { - tran_low_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) + tran_low_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; + + if (zero_coeff) highbd_idct32_c(input, outptr, bd); else memset(outptr, 0, sizeof(tran_low_t) * 32); @@ -2569,13 +2545,12 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8); // Rows // Only upper-left 16x16 has non-zero coeff @@ -2598,13 +2573,12 @@ void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows // Only upper-left 8x8 has non-zero coeff @@ -2625,11 +2599,10 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; int a1; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/intrapred_vsx.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/intrapred_vsx.c new file mode 100644 index 00000000000..6273460f190 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/intrapred_vsx.c @@ -0,0 +1,749 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, above); + int i; + (void)left; + + for (i = 0; i < 16; i++, dst += stride) { + vec_vsx_st(d, 0, dst); + } +} + +void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vec_vsx_ld(0, above); + const uint8x16_t d1 = vec_vsx_ld(16, above); + int i; + (void)left; + + for (i = 0; i < 32; i++, dst += stride) { + vec_vsx_st(d0, 0, dst); + vec_vsx_st(d1, 16, dst); + } +} + +static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + +void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + (void)above; + + vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); +} + +void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + const uint8x16_t v4 = vec_splat(d, 4); + const uint8x16_t v5 = vec_splat(d, 5); + const uint8x16_t v6 = vec_splat(d, 6); + const uint8x16_t v7 = vec_splat(d, 7); + + (void)above; + + vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst); +} + +void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + const uint8x16_t v4 = vec_splat(d, 4); + const uint8x16_t v5 = vec_splat(d, 5); + const uint8x16_t v6 = vec_splat(d, 6); + const uint8x16_t v7 = vec_splat(d, 7); + + const uint8x16_t v8 = vec_splat(d, 8); + const uint8x16_t v9 = vec_splat(d, 9); + const uint8x16_t v10 = vec_splat(d, 10); + const uint8x16_t v11 = vec_splat(d, 11); + + const uint8x16_t v12 = 
vec_splat(d, 12); + const uint8x16_t v13 = vec_splat(d, 13); + const uint8x16_t v14 = vec_splat(d, 14); + const uint8x16_t v15 = vec_splat(d, 15); + + (void)above; + + vec_vsx_st(v0, 0, dst); + dst += stride; + vec_vsx_st(v1, 0, dst); + dst += stride; + vec_vsx_st(v2, 0, dst); + dst += stride; + vec_vsx_st(v3, 0, dst); + dst += stride; + vec_vsx_st(v4, 0, dst); + dst += stride; + vec_vsx_st(v5, 0, dst); + dst += stride; + vec_vsx_st(v6, 0, dst); + dst += stride; + vec_vsx_st(v7, 0, dst); + dst += stride; + vec_vsx_st(v8, 0, dst); + dst += stride; + vec_vsx_st(v9, 0, dst); + dst += stride; + vec_vsx_st(v10, 0, dst); + dst += stride; + vec_vsx_st(v11, 0, dst); + dst += stride; + vec_vsx_st(v12, 0, dst); + dst += stride; + vec_vsx_st(v13, 0, dst); + dst += stride; + vec_vsx_st(v14, 0, dst); + dst += stride; + vec_vsx_st(v15, 0, dst); +} + +#define H_PREDICTOR_32(v) \ + vec_vsx_st(v, 0, dst); \ + vec_vsx_st(v, 16, dst); \ + dst += stride + +void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vec_vsx_ld(0, left); + const uint8x16_t d1 = vec_vsx_ld(16, left); + + const uint8x16_t v0_0 = vec_splat(d0, 0); + const uint8x16_t v1_0 = vec_splat(d0, 1); + const uint8x16_t v2_0 = vec_splat(d0, 2); + const uint8x16_t v3_0 = vec_splat(d0, 3); + const uint8x16_t v4_0 = vec_splat(d0, 4); + const uint8x16_t v5_0 = vec_splat(d0, 5); + const uint8x16_t v6_0 = vec_splat(d0, 6); + const uint8x16_t v7_0 = vec_splat(d0, 7); + const uint8x16_t v8_0 = vec_splat(d0, 8); + const uint8x16_t v9_0 = vec_splat(d0, 9); + const uint8x16_t v10_0 = vec_splat(d0, 10); + const uint8x16_t v11_0 = vec_splat(d0, 11); + const uint8x16_t v12_0 = vec_splat(d0, 12); + const uint8x16_t v13_0 = vec_splat(d0, 13); + const uint8x16_t v14_0 = vec_splat(d0, 14); + const uint8x16_t v15_0 = vec_splat(d0, 15); + + const uint8x16_t v0_1 = vec_splat(d1, 0); + const uint8x16_t v1_1 = vec_splat(d1, 1); + const uint8x16_t v2_1 = vec_splat(d1, 2); + const uint8x16_t v3_1 = vec_splat(d1, 3); + const uint8x16_t v4_1 = vec_splat(d1, 4); + const uint8x16_t v5_1 = vec_splat(d1, 5); + const uint8x16_t v6_1 = vec_splat(d1, 6); + const uint8x16_t v7_1 = vec_splat(d1, 7); + const uint8x16_t v8_1 = vec_splat(d1, 8); + const uint8x16_t v9_1 = vec_splat(d1, 9); + const uint8x16_t v10_1 = vec_splat(d1, 10); + const uint8x16_t v11_1 = vec_splat(d1, 11); + const uint8x16_t v12_1 = vec_splat(d1, 12); + const uint8x16_t v13_1 = vec_splat(d1, 13); + const uint8x16_t v14_1 = vec_splat(d1, 14); + const uint8x16_t v15_1 = vec_splat(d1, 15); + + (void)above; + + H_PREDICTOR_32(v0_0); + H_PREDICTOR_32(v1_0); + H_PREDICTOR_32(v2_0); + H_PREDICTOR_32(v3_0); + + H_PREDICTOR_32(v4_0); + H_PREDICTOR_32(v5_0); + H_PREDICTOR_32(v6_0); + H_PREDICTOR_32(v7_0); + + H_PREDICTOR_32(v8_0); + H_PREDICTOR_32(v9_0); + H_PREDICTOR_32(v10_0); + H_PREDICTOR_32(v11_0); + + H_PREDICTOR_32(v12_0); + H_PREDICTOR_32(v13_0); + H_PREDICTOR_32(v14_0); + H_PREDICTOR_32(v15_0); + + H_PREDICTOR_32(v0_1); + H_PREDICTOR_32(v1_1); + H_PREDICTOR_32(v2_1); + H_PREDICTOR_32(v3_1); + + H_PREDICTOR_32(v4_1); + H_PREDICTOR_32(v5_1); + H_PREDICTOR_32(v6_1); + H_PREDICTOR_32(v7_1); + + H_PREDICTOR_32(v8_1); + H_PREDICTOR_32(v9_1); + H_PREDICTOR_32(v10_1); + H_PREDICTOR_32(v11_1); + + H_PREDICTOR_32(v12_1); + H_PREDICTOR_32(v13_1); + H_PREDICTOR_32(v14_1); + H_PREDICTOR_32(v15_1); +} + +void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = 
unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left)); + const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above)); + int16x8_t tmp, val; + uint8x16_t d; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 0), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 1), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 2), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 3), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); +} + +void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left)); + const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above)); + int16x8_t tmp, val; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 0), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 1), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 2), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 3), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 4), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 5), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 6), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 7), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); +} + +static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l, + int16x8_t ah, int16x8_t al, int16x8_t tl) { + int16x8_t vh, vl, ls; + + ls = vec_splat(l, 0); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 1); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 2); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 3); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 4); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 5); + vh = 
vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 6); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 7); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); +} + +void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const uint8x16_t l = vec_vsx_ld(0, left); + const int16x8_t lh = unpack_to_s16_h(l); + const int16x8_t ll = unpack_to_s16_l(l); + const uint8x16_t a = vec_vsx_ld(0, above); + const int16x8_t ah = unpack_to_s16_h(a); + const int16x8_t al = unpack_to_s16_l(a); + + tm_predictor_16x8(dst, stride, lh, ah, al, tl); + + dst += stride * 8; + + tm_predictor_16x8(dst, stride, ll, ah, al, tl); +} + +static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls, + const int16x8_t a0h, const int16x8_t a0l, + const int16x8_t a1h, const int16x8_t a1l, + const int16x8_t tl) { + int16x8_t vh, vl; + + vh = vec_sub(vec_add(ls, a0h), tl); + vl = vec_sub(vec_add(ls, a0l), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + vh = vec_sub(vec_add(ls, a1h), tl); + vl = vec_sub(vec_add(ls, a1l), tl); + vec_vsx_st(vec_packsu(vh, vl), 16, dst); +} + +static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride, + const int16x8_t l, const uint8x16_t a0, + const uint8x16_t a1, const int16x8_t tl) { + const int16x8_t a0h = unpack_to_s16_h(a0); + const int16x8_t a0l = unpack_to_s16_l(a0); + const int16x8_t a1h = unpack_to_s16_h(a1); + const int16x8_t a1l = unpack_to_s16_l(a1); + + tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl); +} + +void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const uint8x16_t l1 = vec_vsx_ld(16, left); + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + + tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl); +} + +static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 8; i++, dst += stride) { + const uint8x16_t d = vec_vsx_ld(0, dst); + vec_vsx_st(xxpermdi(val, d, 1), 0, dst); + } +} + +static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { 
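/* This helper stores one splatted vector to each of the 16 rows; the
 * scalar equivalent is simply
 *   for (r = 0; r < 16; r++) memset(dst + r * stride, dc_value, 16);
 * In the vpx_dc_128_* callers further down, the 0x80 fill value is
 * built as vec_sl(vec_splat_u8(1), vec_splat_u8(7)) because
 * vec_splat_u8() only accepts a 5-bit literal (-16..15), so 128 cannot
 * be encoded directly. */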
+ int i; + + for (i = 0; i < 16; i++, dst += stride) { + vec_vsx_st(val, 0, dst); + } +} + +void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7)); + (void)above; + (void)left; + + dc_fill_predictor_16x16(dst, stride, v128); +} + +static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 32; i++, dst += stride) { + vec_vsx_st(val, 0, dst); + vec_vsx_st(val, 16, dst); + } +} + +void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7)); + (void)above; + (void)left; + + dc_fill_predictor_32x32(dst, stride, v128); +} + +static uint8x16_t avg16(const uint8_t *values) { + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0)); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8)); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + dc_fill_predictor_16x16(dst, stride, avg16(left)); +} + +void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + dc_fill_predictor_16x16(dst, stride, avg16(above)); +} + +static uint8x16_t avg32(const uint8_t *values) { + const uint8x16_t v0 = vec_vsx_ld(0, values); + const uint8x16_t v1 = vec_vsx_ld(16, values); + const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4)); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0))); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + dc_fill_predictor_32x32(dst, stride, avg32(left)); +} + +void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + dc_fill_predictor_32x32(dst, stride, avg32(above)); +} + +static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0))); + const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8)); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4)); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0))); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5)); + + return vec_splat(vec_pack(vec_pack(avg, 
vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left)); +} + +void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left)); +} + +static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const uint8x16_t l1 = vec_vsx_ld(16, left); + const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5)); + const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0))); + const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum)); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left)); +} + +static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b, + const uint8x16_t c) { + const uint8x16_t ac = + vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1))); + + return vec_avg(ac, b); +} + +// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken. +static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, + 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 }; + +void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t af = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(af, 7); + const uint8x16_t a = xxpermdi(af, above_right, 1); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 8; i++) { + const uint8x16_t d = vec_vsx_ld(0, dst); + vec_vsx_st(xxpermdi(row, d, 1), 0, dst); + dst += stride; + row = vec_perm(row, above_right, sl1); + } +} + +void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(a, 15); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 16; i++) { + vec_vsx_st(row, 0, dst); + dst += stride; + row = vec_perm(row, above_right, sl1); + } +} + +void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t above_right = vec_splat(a1, 15); + const uint8x16_t b0 = vec_perm(a0, a1, sl1); + const uint8x16_t b1 = vec_perm(a1, above_right, sl1); + const uint8x16_t c0 = vec_perm(b0, b1, sl1); + const uint8x16_t c1 = vec_perm(b1, above_right, sl1); + uint8x16_t row0 = avg3(a0, b0, c0); + uint8x16_t row1 = avg3(a1, b1, c1); + int i; + (void)left; + + for (i = 0; i < 32; i++) { + vec_vsx_st(row0, 0, dst); + vec_vsx_st(row1, 16, dst); + dst += stride; + row0 = vec_perm(row0, row1, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void 
vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t af = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(af, 9); + const uint8x16_t a = xxpermdi(af, above_right, 1); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row0 = vec_avg(a, b); + uint8x16_t row1 = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 4; i++) { + const uint8x16_t d0 = vec_vsx_ld(0, dst); + const uint8x16_t d1 = vec_vsx_ld(0, dst + stride); + vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst); + vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride); + dst += stride * 2; + row0 = vec_perm(row0, above_right, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t above_right = vec_splat(a1, 0); + const uint8x16_t b = vec_perm(a0, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row0 = vec_avg(a0, b); + uint8x16_t row1 = avg3(a0, b, c); + int i; + (void)left; + + for (i = 0; i < 8; i++) { + vec_vsx_st(row0, 0, dst); + vec_vsx_st(row1, 0, dst + stride); + dst += stride * 2; + row0 = vec_perm(row0, above_right, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t a2 = vec_vsx_ld(32, above); + const uint8x16_t above_right = vec_splat(a2, 0); + const uint8x16_t b0 = vec_perm(a0, a1, sl1); + const uint8x16_t b1 = vec_perm(a1, above_right, sl1); + const uint8x16_t c0 = vec_perm(b0, b1, sl1); + const uint8x16_t c1 = vec_perm(b1, above_right, sl1); + uint8x16_t row0_0 = vec_avg(a0, b0); + uint8x16_t row0_1 = vec_avg(a1, b1); + uint8x16_t row1_0 = avg3(a0, b0, c0); + uint8x16_t row1_1 = avg3(a1, b1, c1); + int i; + (void)left; + + for (i = 0; i < 16; i++) { + vec_vsx_st(row0_0, 0, dst); + vec_vsx_st(row0_1, 16, dst); + vec_vsx_st(row1_0, 0, dst + stride); + vec_vsx_st(row1_1, 16, dst + stride); + dst += stride * 2; + row0_0 = vec_perm(row0_0, row0_1, sl1); + row0_1 = vec_perm(row0_1, above_right, sl1); + row1_0 = vec_perm(row1_0, row1_1, sl1); + row1_1 = vec_perm(row1_1, above_right, sl1); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/sad_vsx.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/sad_vsx.c new file mode 100644 index 00000000000..3edb40c3158 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/sad_vsx.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <stdlib.h> + +#include "vpx_dsp/ppc/types_vsx.h" + +#include "vpx/vpx_integer.h" + +#define PROCESS16(offset) \ + v_a = vec_vsx_ld(offset, a); \ + v_b = vec_vsx_ld(offset, b); \ + v_ah = unpack_to_s16_h(v_a); \ + v_al = unpack_to_s16_l(v_a); \ + v_bh = unpack_to_s16_h(v_b); \ + v_bl = unpack_to_s16_l(v_b); \ + v_subh = vec_sub(v_ah, v_bh); \ + v_subl = vec_sub(v_al, v_bl); \ + v_absh = vec_abs(v_subh); \ + v_absl = vec_abs(v_subl); \ + v_sad = vec_sum4s(v_absh, v_sad); \ + v_sad = vec_sum4s(v_absl, v_sad); + +#define SAD16(height) \ + unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + int32x4_t v_sad = vec_splat_s32(0); \ + \ + for (y = 0; y < height; y++) { \ + PROCESS16(0); \ + \ + a += a_stride; \ + b += b_stride; \ + } \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + \ + return sad[3] + sad[2] + sad[1] + sad[0]; \ + } + +#define SAD32(height) \ + unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + int32x4_t v_sad = vec_splat_s32(0); \ + \ + for (y = 0; y < height; y++) { \ + PROCESS16(0); \ + PROCESS16(16); \ + \ + a += a_stride; \ + b += b_stride; \ + } \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + \ + return sad[3] + sad[2] + sad[1] + sad[0]; \ + } + +#define SAD64(height) \ + unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + int32x4_t v_sad = vec_splat_s32(0); \ + \ + for (y = 0; y < height; y++) { \ + PROCESS16(0); \ + PROCESS16(16); \ + PROCESS16(32); \ + PROCESS16(48); \ + \ + a += a_stride; \ + b += b_stride; \ + } \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + \ + return sad[3] + sad[2] + sad[1] + sad[0]; \ + } + +SAD16(8); +SAD16(16); +SAD16(32); +SAD32(16); +SAD32(32); +SAD32(64); +SAD64(32); +SAD64(64); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/types_vsx.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/types_vsx.h index 2f3aa20495f..f611d02d2d5 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/types_vsx.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/types_vsx.h @@ -13,8 +13,56 @@ #include <altivec.h> +typedef vector signed char int8x16_t; +typedef vector unsigned char uint8x16_t; typedef vector signed short int16x8_t; typedef vector unsigned short uint16x8_t; typedef vector signed int int32x4_t; +typedef vector unsigned int uint32x4_t; + +#ifdef __clang__ +static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17 }; +static const uint8x16_t xxpermdi1_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; +static const uint8x16_t xxpermdi2_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17 }; +static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; +#define xxpermdi(a, b, c) vec_perm(a, b, xxpermdi##c##_perm) +#elif defined(__GNUC__) && \ + (__GNUC__ 
> 6 || (__GNUC__ == 6 && __GNUC_MINOR__ >= 3)) +#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) +#endif + +#ifdef WORDS_BIGENDIAN +#define unpack_to_u16_h(v) \ + (uint16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_u16_l(v) \ + (uint16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_s16_h(v) \ + (int16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_s16_l(v) \ + (int16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v) +#ifndef xxpermdi +#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) +#endif +#else +#define unpack_to_u16_h(v) \ + (uint16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_u16_l(v) \ + (uint16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_s16_h(v) \ + (int16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_s16_l(v) \ + (int16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) +#ifndef xxpermdi +#define xxpermdi(a, b, c) vec_xxpermdi(b, a, ((c >> 1) | (c & 1) << 1) ^ 3) +#endif +#endif #endif // VPX_DSP_PPC_TYPES_VSX_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/variance_vsx.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/variance_vsx.c new file mode 100644 index 00000000000..1efe2f00569 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/variance_vsx.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static inline uint8x16_t read4x2(const uint8_t *a, int stride) { + const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); + const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride); + + return (uint8x16_t)vec_mergeh(a0, a1); +} + +uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride) { + int distortion; + + const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride)); + const int16x8_t a1 = unpack_to_s16_h(read4x2(a + a_stride * 2, a_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride)); + const int16x8_t b1 = unpack_to_s16_h(read4x2(b + b_stride * 2, b_stride)); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0))); + const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3); + + vec_ste(d, 0, &distortion); + + return distortion; +} + +// TODO(lu_zero): Unroll +uint32_t vpx_get_mb_ss_vsx(const int16_t *a) { + unsigned int i, sum = 0; + int32x4_t s = vec_splat_s32(0); + + for (i = 0; i < 256; i += 8) { + const int16x8_t v = vec_vsx_ld(0, a + i); + s = vec_msum(v, v, s); + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)s, 0, &sum); + + return sum; +} + +void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + /* comp_pred and pred must be 16 byte aligned. 
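(Callers are updated to guarantee this: the sad.c hunk later in this patch switches its comp_pred scratch buffers to DECLARE_ALIGNED(16, ...).)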
*/ + assert(((intptr_t)comp_pred & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width >= 16) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref)); + vec_vsx_st(v, j, comp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + // Process 2 lines at time + for (i = 0; i < height / 2; ++i) { + const uint8x16_t r0 = vec_vsx_ld(0, ref); + const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride); + const uint8x16_t r = xxpermdi(r0, r1, 0); + const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r); + vec_vsx_st(v, 0, comp_pred); + comp_pred += 16; // width * 2; + pred += 16; // width * 2; + ref += ref_stride * 2; + } + } else { + assert(width == 4); + // process 4 lines at time + for (i = 0; i < height / 4; ++i) { + const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref); + const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride); + const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2); + const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3); + const uint8x16_t r = + (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0); + const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r); + vec_vsx_st(v, 0, comp_pred); + comp_pred += 16; // width * 4; + pred += 16; // width * 4; + ref += ref_stride * 4; + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c new file mode 100644 index 00000000000..55dcdc2baf4 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include <assert.h> +#include <string.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// TODO(lu_zero): unroll +static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); + vec_vsx_st(vec_vsx_ld(48, src), 48, dst); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int32_t filter_x_stride, + const int16_t *filter_y, int32_t filter_y_stride, + int32_t w, int32_t h) { + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + + switch (w) { + case 16: { + copy_w16(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_w32(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_w64(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int i; + for (i = h; i--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} + +static inline void avg_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + vec_vsx_st(v, 0, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void avg_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst)); + vec_vsx_st(v0, 0, dst); + vec_vsx_st(v1, 16, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst)); + const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst)); + const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst)); + vec_vsx_st(v0, 0, dst); + vec_vsx_st(v1, 16, dst); + vec_vsx_st(v2, 32, dst); + vec_vsx_st(v3, 48, dst); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int32_t filter_x_stride, + const int16_t *filter_y, int32_t filter_y_stride, + int32_t w, int32_t h) { + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + + switch (w) { + case 16: { + avg_w16(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_w32(src, 
src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_w64(src, src_stride, dst, dst_stride, h); + break; + } + default: { + vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h); + break; + } + } +} + +static inline void convolve_line(uint8_t *dst, const int16x8_t s, + const int16x8_t f) { + const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0)); + const int32x4_t bias = + vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1)); + const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS)); + const uint8x16_t v = vec_splat( + vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3); + vec_ste(v, 0, dst); +} + +static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x, + const int16_t *const x_filter) { + const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x)); + const int16x8_t f = vec_vsx_ld(0, x_filter); + + convolve_line(dst, s, f); +} + +// TODO(lu_zero): Implement 8x8 and bigger block special cases +static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS], + x_filters[x_q4 & SUBPEL_MASK]); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static inline void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + uint8_t v; + convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS], + x_filters[x_q4 & SUBPEL_MASK]); + dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b, + uint8x16_t c, uint8x16_t d, + uint8x16_t e, uint8x16_t f, + uint8x16_t g, uint8x16_t h) { + uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b); + uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d); + uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f); + uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h); + + uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd); + uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh); + + return (uint8x16_t)vec_mergeh(abcd, efgh); +} + +static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y, + ptrdiff_t src_stride, + const int16_t *const y_filter) { + uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride); + uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride); + uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride); + uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride); + uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride); + uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride); + uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride); + uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride); + const int16x8_t f = vec_vsx_ld(0, y_filter); + uint8_t buf[16]; + const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7); + + vec_vsx_st(s, 0, buf); + + convolve_line(dst, unpack_to_s16_h(s), f); +} + +static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int 
w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + convolve_line_v(dst + y * dst_stride, + &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride, + y_filters[y_q4 & SUBPEL_MASK]); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + uint8_t v; + convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride, + y_filters[y_q4 & SUBPEL_MASK]); + dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
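/* Evaluating the expression computed below at the extremes the asserts
 * permit (h = 64, y_step_q4 = 32, y0_q4 = 15):
 *   intermediate_height = (((64 - 1) * 32 + 15) >> 4) + 8
 *                       = (2031 >> 4) + 8 = 126 + 8 = 134
 * rows, which the 64 * 135 scratch buffer declared next covers. */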
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + x_filters, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + y_filters, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + w, h); +} + +void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, + w, h); +} + +void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + // Fixed size intermediate buffer places limits on parameters. 
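/* Instead of duplicating the filter loops with an averaging store, this
 * kernel composes two routines defined earlier in the file: it filters
 * into the 64x64 scratch buffer declared next with vpx_convolve8_vsx(),
 * then blends the result into dst with vpx_convolve_avg_vsx(), i.e.
 *   dst[i] = ROUND_POWER_OF_TWO(dst[i] + temp[i], 1)
 * per pixel (vec_avg rounds identically: (a + b + 1) >> 1). */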
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_vsx(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, + y_step_q4, w, h); + vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c index c80ef729bff..6ceb37e430b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c @@ -39,7 +39,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - uint8_t comp_pred[m * n]; \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ return sad(src, src_stride, comp_pred, m, m, n); \ } @@ -178,7 +178,7 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, unsigned int vpx_highbd_sad##m##x##n##_avg_c( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - uint16_t comp_pred[m * n]; \ + DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \ vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c index 4214150251f..b1744047af1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> + #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -224,6 +226,9 @@ MSE(8, 8) void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { int i, j; + /* comp_pred and pred must be 16 byte aligned. 
*/ + assert(((intptr_t)comp_pred & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c index cab6368e606..02c5a955a76 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c @@ -319,13 +319,11 @@ void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -343,13 +341,11 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -369,13 +365,11 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { @@ -395,13 +389,11 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { @@ -423,8 +415,8 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4, int x_step_q4, const InterpKernel *const y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { @@ -450,15 +442,14 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - CONVERT_TO_BYTEPTR(temp), 
64, x_filters, x0_q4, - x_step_q4, w, intermediate_height, bd); - highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1), - 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, - bd); + temp, 64, x_filters, x0_q4, x_step_q4, w, + intermediate_height, bd); + highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + y_filters, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -472,8 +463,8 @@ void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, x_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -487,8 +478,8 @@ void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, x_step_q4, w, h, bd); } -void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -502,8 +493,8 @@ void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -517,8 +508,8 @@ void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -531,8 +522,8 @@ void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, filters_y, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -541,20 +532,18 @@ void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, assert(w <= 64); assert(h <= 64); - vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL, - 0, NULL, 0, w, h, bd); + vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, + filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(temp, 64, dst, 
dst_stride, NULL, 0, NULL, 0, w, h, + bd); } -void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { int r; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_x_stride; @@ -569,14 +558,12 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, } } -void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_x_stride; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.h index ee9744b3ae0..1aedd32bd4b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.h @@ -24,8 +24,8 @@ typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, int h); #if CONFIG_VP9_HIGHBITDEPTH -typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk index 73c50fd3dd9..6ac7182abde 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk @@ -51,6 +51,7 @@ DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm +DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm @@ -95,6 +96,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c @@ -142,6 +144,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c +DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c + # loop filters DSP_SRCS-yes += loopfilter.c @@ -189,6 +193,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h +DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c 
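# [Editorial aside, not part of the patch] As elsewhere in this makefile,
# each DSP_SRCS-$(HAVE_FOO) list presumably expands to DSP_SRCS-yes or
# DSP_SRCS-no depending on what configure detected, so the VSX, NEON and
# SSE2/AVX2 files added in this change are only compiled when the target
# supports the corresponding extension (cf. DSP_SRCS-no +=
# $(DSP_SRCS_REMOVE-yes) near the end of the file).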
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c @@ -227,6 +232,11 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c endif # !CONFIG_VP9_HIGHBITDEPTH ifeq ($(HAVE_NEON_ASM),yes) @@ -302,6 +312,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm +DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c + ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm @@ -320,9 +332,11 @@ DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c +DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c ifeq ($(ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm @@ -339,6 +353,7 @@ endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC # Neon utilities +DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h # PPC VSX utilities @@ -346,6 +361,9 @@ DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h +# X86 utilities +DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h + DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) DSP_SRCS-yes += vpx_dsp_rtcd.c diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index 5c2ba1cc541..410055077c5 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -39,7 +39,7 @@ specialize qw/vpx_d63_predictor_4x4 ssse3/; add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2/; +specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; @@ -57,7 +57,7 @@ specialize qw/vpx_v_predictor_4x4 neon msa sse2/; add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2/; +specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize 
qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/; @@ -75,13 +75,13 @@ add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vpx_d207_predictor_8x8 ssse3/; add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d45_predictor_8x8 neon sse2/; +specialize qw/vpx_d45_predictor_8x8 neon sse2 vsx/; add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_8x8 ssse3/; +specialize qw/vpx_d63_predictor_8x8 ssse3 vsx/; add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; +specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; @@ -95,10 +95,10 @@ add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const specialize qw/vpx_v_predictor_8x8 neon msa sse2/; add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2/; +specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/; +specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 vsx/; add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/; @@ -113,13 +113,13 @@ add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vpx_d207_predictor_16x16 ssse3/; add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d45_predictor_16x16 neon ssse3/; +specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/; add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_16x16 ssse3/; +specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/; add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2/; +specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; @@ -130,34 +130,34 @@ add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vpx_d153_predictor_16x16 ssse3/; add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_v_predictor_16x16 neon msa sse2/; +specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_tm_predictor_16x16 neon msa sse2/; +specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_predictor_16x16 dspr2 neon 
msa sse2/; +specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/; add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2/; +specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2/; +specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2/; +specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_32x32 ssse3/; add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d45_predictor_32x32 neon ssse3/; +specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/; add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_32x32 ssse3/; +specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/; add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_h_predictor_32x32 neon msa sse2/; +specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; @@ -168,22 +168,22 @@ add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vpx_d153_predictor_32x32 ssse3/; add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_v_predictor_32x32 neon msa sse2/; +specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/; add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_tm_predictor_32x32 neon msa sse2/; +specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/; add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_predictor_32x32 msa neon sse2/; +specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/; add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2/; +specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/; add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2/; +specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/; add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2/; +specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/; # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -332,28 +332,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Sub Pixel 
Filters # add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve_copy neon dspr2 msa sse2/; +specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve_avg neon dspr2 msa sse2/; +specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa/; +specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx/; add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa/; +specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/; add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa/; +specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/; +specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/; add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; specialize qw/vpx_scaled_2d ssse3/; @@ -372,29 +372,29 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Sub Pixel Filters # - add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve_copy sse2 neon/; + add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const 
int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve_avg sse2 neon/; + add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8 neon/, "$sse2_x86_64"; + add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8_horiz neon/, "$sse2_x86_64"; + add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8_vert neon/, "$sse2_x86_64"; + add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8_avg neon/, "$sse2_x86_64"; + add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8_avg_horiz neon/, "$sse2_x86_64"; + add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t 
*filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8_avg_vert neon/, "$sse2_x86_64"; + add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64"; } # CONFIG_VP9_HIGHBITDEPTH # @@ -484,7 +484,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct4x4 sse2/; + specialize qw/vpx_fdct4x4 neon sse2/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2/; @@ -532,7 +532,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; } else { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct4x4 sse2 msa/; + specialize qw/vpx_fdct4x4 neon sse2 msa/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2/; @@ -563,234 +563,106 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Inverse transform if (vpx_config("CONFIG_VP9") eq "yes") { + +add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; +add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + +if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { + # Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH is off. 
+ specialize qw/vpx_idct4x4_16_add neon sse2/; + specialize qw/vpx_idct4x4_1_add neon sse2/; + specialize qw/vpx_idct8x8_64_add neon sse2 ssse3/; + specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/; + specialize qw/vpx_idct8x8_1_add neon sse2/; + specialize qw/vpx_idct16x16_256_add neon sse2/; + specialize qw/vpx_idct16x16_38_add neon sse2/; + $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2; + specialize qw/vpx_idct16x16_10_add neon sse2/; + specialize qw/vpx_idct16x16_1_add neon sse2/; + specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/; + specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; + $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; + specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; + specialize qw/vpx_idct32x32_1_add neon sse2/; + + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { + # Note that these specializations append to the above ones. + specialize qw/vpx_idct4x4_16_add dspr2 msa/; + specialize qw/vpx_idct4x4_1_add dspr2 msa/; + specialize qw/vpx_idct8x8_64_add dspr2 msa/; + specialize qw/vpx_idct8x8_12_add dspr2 msa/; + specialize qw/vpx_idct8x8_1_add dspr2 msa/; + specialize qw/vpx_idct16x16_256_add dspr2 msa/; + specialize qw/vpx_idct16x16_38_add dspr2 msa/; + $vpx_idct16x16_38_add_dspr2=vpx_idct16x16_256_add_dspr2; + $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa; + specialize qw/vpx_idct16x16_10_add dspr2 msa/; + specialize qw/vpx_idct16x16_1_add dspr2 msa/; + specialize qw/vpx_idct32x32_1024_add dspr2 msa/; + specialize qw/vpx_idct32x32_135_add dspr2 msa/; + $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2; + $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; + specialize qw/vpx_idct32x32_34_add dspr2 msa/; + specialize qw/vpx_idct32x32_1_add dspr2 msa/; + specialize qw/vpx_iwht4x4_16_add msa sse2/; + specialize qw/vpx_iwht4x4_1_add msa/; + } # !CONFIG_VP9_HIGHBITDEPTH +} # !CONFIG_EMULATE_HARDWARE + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Note: as optimized versions of these functions are added, we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
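# [Editorial aside, not part of the patch] add_proto declares a run-time
# dispatched entry point and specialize lists its ISA variants; the
# generated vpx_dsp_rtcd.h then picks an implementation at init, roughly
# (illustrative sketch only -- exact output depends on target and flags):
#   vpx_idct4x4_16_add = vpx_idct4x4_16_add_c;
#   if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2;
# This is why the CONFIG_EMULATE_HARDWARE branches omit specialize: with no
# variants listed, every entry stays pinned to the C version.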
- add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_iwht4x4_16_add sse2/; - add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct4x4_1_add neon/; - add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct8x8_1_add neon/; - add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct16x16_1_add neon/; - add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct32x32_1_add neon sse2/; - add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, 
int stride"; - - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - - add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - } else { - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct4x4_16_add neon sse2/; - - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct4x4_1_add neon sse2/; - - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct8x8_64_add neon sse2 ssse3/; - - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/; - - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct8x8_1_add neon sse2/; - - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct16x16_256_add neon sse2/; - - add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct16x16_38_add neon sse2/; - $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2; - - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct16x16_10_add neon sse2/; - - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct16x16_1_add neon sse2/; - - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/; - - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; - # Need to add 135 eob idct32x32 implementations. 
- $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; - - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; + add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_1_add neon sse2/; - - add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vpx_highbd_idct4x4_16_add neon sse2/; - - add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct8x8_64_add neon sse2/; - - add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct8x8_12_add neon sse2/; - - add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct16x16_256_add neon sse2/; - - add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct16x16_38_add neon sse2/; $vpx_highbd_idct16x16_38_add_sse2=vpx_highbd_idct16x16_256_add_sse2; - - add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct16x16_10_add neon sse2/; - - add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct32x32_1024_add neon/; - - add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct32x32_135_add neon/; - - add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct32x32_34_add neon/; - } # CONFIG_EMULATE_HARDWARE -} else { - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void 
vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - - add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - } else { - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct4x4_1_add sse2 neon dspr2 msa/; - - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct4x4_16_add sse2 neon dspr2 msa/; - - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/; - - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct8x8_64_add sse2 ssse3 neon dspr2 msa/; - - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct8x8_12_add sse2 ssse3 neon dspr2 msa/; - - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/; - - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/; - - add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct16x16_38_add sse2 neon dspr2 msa/; - $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2; - $vpx_idct16x16_38_add_dspr2=vpx_idct16x16_256_add_dspr2; - $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa; - - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/; - - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_1024_add sse2 ssse3 neon dspr2 msa/; - - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_135_add sse2 ssse3 neon dspr2 msa/; - $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; - $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2; - $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; - - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_34_add sse2 ssse3 neon dspr2 msa/; - - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/; - - add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_iwht4x4_1_add msa/; - - add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_iwht4x4_16_add msa sse2/; - } # CONFIG_EMULATE_HARDWARE + } # !CONFIG_EMULATE_HARDWARE } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9 @@ -824,28 +696,28 @@ specialize qw/vpx_subtract_block neon msa sse2/; # Single block SAD # add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x64 avx2 neon msa sse2/; +specialize qw/vpx_sad64x64 avx2 neon msa sse2 vsx/; add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x32 avx2 msa 
sse2/; +specialize qw/vpx_sad64x32 avx2 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x64 avx2 msa sse2/; +specialize qw/vpx_sad32x64 avx2 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x32 avx2 neon msa sse2/; +specialize qw/vpx_sad32x32 avx2 neon msa sse2 vsx/; add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x16 avx2 msa sse2/; +specialize qw/vpx_sad32x16 avx2 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x32 msa sse2/; +specialize qw/vpx_sad16x32 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x16 neon msa sse2/; +specialize qw/vpx_sad16x16 neon msa sse2 vsx/; add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x8 neon msa sse2/; +specialize qw/vpx_sad16x8 neon msa sse2 vsx/; add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad8x16 neon msa sse2/; @@ -1249,10 +1121,10 @@ add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int sourc specialize qw/vpx_variance32x32 sse2 avx2 neon msa/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 msa/; + specialize qw/vpx_variance32x16 sse2 avx2 neon msa/; add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 msa/; + specialize qw/vpx_variance16x32 sse2 neon msa/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance16x16 sse2 avx2 neon msa/; @@ -1267,12 +1139,14 @@ add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_ specialize qw/vpx_variance8x8 sse2 neon msa/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 msa/; + specialize qw/vpx_variance8x4 sse2 neon msa/; add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +# TODO(johannkoenig): neon specialize qw/vpx_variance4x8 sse2 msa/; add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +# TODO(johannkoenig): neon specialize qw/vpx_variance4x4 sse2 msa/; # @@ -1297,12 +1171,13 @@ add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stri specialize qw/vpx_mse8x8 sse2 msa/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; - specialize qw/vpx_get_mb_ss sse2 msa/; + specialize qw/vpx_get_mb_ss sse2 msa vsx/; add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char 
*src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; - specialize qw/vpx_get4x4sse_cs neon msa/; + specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + specialize qw/vpx_comp_avg_pred sse2 vsx/; # # Subpixel Variance @@ -1311,34 +1186,34 @@ add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance64x32 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance64x32 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance32x64 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance32x64 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance32x16 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance32x16 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x32 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x32 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x8 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x8 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x16 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x16 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x4 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x4 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance4x8 msa sse2 ssse3/; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c new file mode 100644 index 00000000000..f83b26490e7 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <emmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + /* comp and pred must be 16 byte aligned. */ + assert(((intptr_t)comp & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width > 8) { + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x += 16) { + const __m128i p = _mm_load_si128((const __m128i *)(pred + x)); + const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x)); + const __m128i avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)(comp + x), avg); + } + comp += width; + pred += width; + ref += ref_stride; + } + } else { // width must be 4 or 8. + int i; + // Process 16 elements at a time. comp and pred have width == stride and + // therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all + // divisible by 16 so just ref needs to be massaged when loading. 
+ for (i = 0; i < width * height; i += 16) { + const __m128i p = _mm_load_si128((const __m128i *)pred); + __m128i r; + __m128i avg; + if (width == ref_stride) { + r = _mm_loadu_si128((const __m128i *)ref); + ref += 16; + } else if (width == 4) { + r = _mm_set_epi32(*(const uint32_t *)(ref + 3 * ref_stride), + *(const uint32_t *)(ref + 2 * ref_stride), + *(const uint32_t *)(ref + ref_stride), + *(const uint32_t *)(ref)); + + ref += 4 * ref_stride; + } else { + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + assert(width == 8); + r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0), + (const __m64 *)(ref + ref_stride))); + + ref += 2 * ref_stride; + } + avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)comp, avg); + + pred += 16; + comp += 16; + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h index d7468ad7ca5..e69d6c61763 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h @@ -103,12 +103,10 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ void vpx_highbd_convolve8_##name##_##opt( \ - const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ @@ -156,7 +154,7 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, } \ } \ if (w) { \ - vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ filter_x, x_step_q4, filter_y, \ y_step_q4, w, h, bd); \ } \ @@ -164,7 +162,7 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, #define HIGH_FUN_CONV_2D(avg, opt) \ void vpx_highbd_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ assert(w <= 64); \ @@ -172,20 +170,20 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, if (x_step_q4 == 16 && y_step_q4 == 16) { \ if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + fdata2, 64, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h + 7, bd); \ vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ + fdata2 + 192, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h, bd); \ } else { \ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x, \ - 
x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h, bd); \ } \ } else { \ vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c new file mode 100644 index 00000000000..2fc7b74303d --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c @@ -0,0 +1,1106 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" + +// ----------------------------------------------------------------------------- +// Copy and average + +void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int width, int h, int bd) { + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + _mm256_storeu_si256((__m256i *)(dst + 32), p2); + _mm256_storeu_si256((__m256i *)(dst + 48), p3); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + p1 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + + _mm256_storeu_si256((__m256i *)dst, p0); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width = 8 + __m128i p0, p1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + p1 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + + _mm_storeu_si128((__m128i *)dst, p0); + dst += dst_stride; + _mm_storeu_si128((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1; + do { + p0 = 
_mm_loadl_epi64((const __m128i *)src); + src += src_stride; + p1 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + + _mm_storel_epi64((__m128i *)dst, p0); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } +} + +void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int width, int h, int bd) { + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + __m256i p0, p1, p2, p3, u0, u1, u2, u3; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); + u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); + _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); + + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + dst_stride), + _mm256_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width = 8 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadu_si128((const __m128i *)dst); + u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); + + _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); + _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadl_epi64((const __m128i *)dst); + u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); + + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); + _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } +} + +// 
----------------------------------------------------------------------------- +// Horizontal and vertical filtering + +#define CONV8_ROUNDING_BITS (7) + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, + __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = _mm256_set1_epi32(0x03020100); + const __m256i p1 = _mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = _mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + 
const __m256i *fil /*fil[4]*/, + __m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + { + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + } + { + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + } + { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + } +} + +static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void vpx_highbd_filter_block1d8_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 
8-sample shifted 16 samples +static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void vpx_highbd_filter_block1d8_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = 
_mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static void vpx_highbd_filter_block1d8_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const 
__m256i s0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + { + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + } +} + +static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static void update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void vpx_highbd_filter_block1d16_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 
1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap vertical filtering + +static void pack_16x2_init(const uint16_t *src, __m256i *sig) { + sig[2] = _mm256_loadu_si256((const __m256i *)src); +} + +static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // load the next row + const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); + sig[0] = _mm256_unpacklo_epi16(sig[2], u); + sig[1] = _mm256_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static void vpx_highbd_filter_block1d16_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m128i p = _mm_set1_epi32(0x09080706); + f[0] = _mm_shuffle_epi8(h, p); +} + +static void pack_8x2_init(const uint16_t *src, __m128i *sig) { + sig[2] = _mm_loadu_si128((const __m128i *)src); +} + +static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, + __m128i *sig) { + // load the next row + const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); + sig[0] = _mm_unpacklo_epi16(sig[2], u); + sig[1] = _mm_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, + __m128i *y0, __m128i *y1) { + const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m128i x0 = _mm_madd_epi16(sig[0], *f); + __m128i x1 = _mm_madd_epi16(sig[1], *f); + x0 = _mm_add_epi32(x0, rounding); + x1 = _mm_add_epi32(x1, rounding); + *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + res = _mm_min_epi16(res, *mask); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void vpx_highbd_filter_block1d8_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_pixels_ver(&res0, &res1, &max, 
dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +// Calculation with averaging the input pixels + +static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y0); + const __m128i a1 = _mm256_extractf128_si256(*y0, 1); + __m128i res = _mm_packus_epi32(a0, a1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); + const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); + const __m256i pix = + _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); + const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); + __m256i p = _mm256_min_epi16(*y0, *mask); + p = _mm256_avg_epu16(p, pix0); + _mm256_storeu_si256((__m256i *)dst, p); + + p = _mm256_min_epi16(*y1, *mask); + p = _mm256_avg_epu16(p, pix1); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, + const __m128i *y1, + const __m128i *mask, + uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, *mask); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void vpx_highbd_filter_block1d8_h8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_avg_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + 
pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_v8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d16_v8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_h2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_avg_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d16_v2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, 
&res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_v2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2 +#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2 +#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 +#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 + +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +HIGH_FUN_CONV_2D(, avx2); + +void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +#define vpx_highbd_filter_block1d4_h8_avg_avx2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_h2_avg_avx2 \ + vpx_highbd_filter_block1d4_h2_avg_sse2 +#define vpx_highbd_filter_block1d4_v8_avg_avx2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_v2_avg_avx2 \ + vpx_highbd_filter_block1d4_v2_avg_sse2 + +HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2); +HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + avx2); +HIGH_FUN_CONV_2D(avg_, avx2); + +#undef HIGHBD_FUNC diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c new file mode 100644 index 00000000000..f16e4d07186 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[32]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_set1_epi16(32); + const __m128i max = _mm_set1_epi16(3155); + const __m128i min = _mm_set1_epi16(-3155); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 16; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); + inptr[i + 16] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 32; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct16_sse2(inptr, inptr + 16); + + // Find the min & max for the column transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 32; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + array_transpose_16x16(inptr, inptr + 16); + for (i = 0; i < 16; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); + sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); + temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 16; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + } + + if (optimised_cols) { + idct16_sse2(inptr, inptr + 16); + + // Final round & shift and Reconstruction and Store + { + __m128i d[2]; + for (i = 0; i < 16; i++) { + inptr[i] = _mm_add_epi16(inptr[i], rounding); + inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); + inptr[i] = _mm_srai_epi16(inptr[i], 
6); + inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[16], temp_out[16]; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } + } +} + +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[32]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_set1_epi16(32); + const __m128i max = _mm_set1_epi16(3155); + const __m128i min = _mm_set1_epi16(-3155); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 16; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); + inptr[i + 16] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // Since all non-zero dct coefficients are in upper-left 4x4 area, + // we only need to consider first 4 rows here. + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform (N.B. This transposes inptr) + idct16_sse2(inptr, inptr + 16); + + // Find the min & max for the column transform + // N.B. 
Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 16; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use fact only first 4 rows contain non-zero coeffs + array_transpose_8x8(inptr, inptr); + array_transpose_8x8(inptr + 8, inptr + 16); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); + sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); + temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + } + + if (optimised_cols) { + idct16_sse2(inptr, inptr + 16); + + // Final round & shift and Reconstruction and Store + { + __m128i d[2]; + for (i = 0; i < 16; i++) { + inptr[i] = _mm_add_epi16(inptr[i], rounding); + inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); + inptr[i] = _mm_srai_epi16(inptr[i], 6); + inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[16], temp_out[16]; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c new file mode 100644 index 00000000000..bc9debf319c --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i dc_value, d; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + int a, i, j; + tran_low_t out; + + out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + a = ROUND_POWER_OF_TWO(out, 6); + + d = _mm_set1_epi32(a); + dc_value = _mm_packs_epi32(d, d); + for (i = 0; i < 32; ++i) { + for (j = 0; j < 4; ++j) { + d = _mm_loadu_si128((const __m128i *)(&dest[j * 8])); + d = _mm_adds_epi16(d, dc_value); + d = _mm_max_epi16(d, zero); + d = _mm_min_epi16(d, max); + _mm_storeu_si128((__m128i *)(&dest[j * 8]), d); + } + dest += stride; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c new file mode 100644 index 00000000000..3949ce92f89 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + __m128i inptr[4]; + __m128i sign_bits[2]; + __m128i temp_mm, min_input, max_input; + int test; + int optimised_cols = 0; + const __m128i zero = _mm_set1_epi16(0); + const __m128i eight = _mm_set1_epi16(8); + const __m128i max = _mm_set1_epi16(12043); + const __m128i min = _mm_set1_epi16(-12043); + // Load input into __m128i + inptr[0] = _mm_loadu_si128((const __m128i *)input); + inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); + inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); + inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); + + // Pack to 16 bits + inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); + inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); + + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (!test) { + // Do the row transform + idct4_sse2(inptr); + + // Check the min & max values + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (test) { + transpose_16bit_4x4(inptr); + sign_bits[0] = 
_mm_cmplt_epi16(inptr[0], zero); + sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); + inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); + inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); + inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); + inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); + _mm_storeu_si128((__m128i *)outptr, inptr[0]); + _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); + _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); + _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vpx_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + } + + if (optimised_cols) { + idct4_sse2(inptr); + + // Final round and shift + inptr[0] = _mm_add_epi16(inptr[0], eight); + inptr[1] = _mm_add_epi16(inptr[1], eight); + + inptr[0] = _mm_srai_epi16(inptr[0], 4); + inptr[1] = _mm_srai_epi16(inptr[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); + __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi64( + d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); + d2 = _mm_unpacklo_epi64( + d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); + d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); + d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); + // store input0 + _mm_storel_epi64((__m128i *)dest, d0); + // store input1 + d0 = _mm_srli_si128(d0, 8); + _mm_storel_epi64((__m128i *)(dest + stride), d0); + // store input2 + _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); + // store input3 + d2 = _mm_srli_si128(d2, 8); + _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[4], temp_out[4]; + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + vpx_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c new file mode 100644 index 00000000000..6a2e180646c --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct8_sse2(inptr); + + // Find the min & max for the column transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + array_transpose_8x8(inptr, inptr); + for (i = 0; i < 8; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 8; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} + +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, 
temp1, temp2, sign_bits; + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // only the first 4 rows have non-zero coefs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct8_sse2(inptr); + + // Find the min & max for the column transform + // N.B. Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use the fact that only the first 4 rows contain non-zero coeffs + array_transpose_4X8(inptr, inptr); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h new file mode 100644 index 00000000000..774cce1d40c --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ + +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { + __m128i ubounded, retval; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + ubounded = _mm_cmpgt_epi16(value, max); + retval = _mm_andnot_si128(ubounded, value); + ubounded = _mm_and_si128(ubounded, max); + retval = _mm_or_si128(retval, ubounded); + retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); + return retval; +} + +#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index 8c33caedbd8..f75dab07aed 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -10,153 +10,36 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ - } - void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; + __m128i in[2]; // Rows - input0 = load_input_data(input); - input2 = load_input_data(input + 8); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, 
input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8); + idct4_sse2(in); // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. 
- input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); + idct4_sse2(in); // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } + recon_and_store4x4_sse2(in, dest, stride); } void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a; + __m128i dc_value, d[2]; a = (int)dct_const_round_shift(input[0] * cospi_16_64); a = (int)dct_const_round_shift(a * cospi_16_64); @@ -164,18 +47,26 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, dc_value = _mm_set1_epi16(a); - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -static INLINE void transpose_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], dc_value); + d[1] = _mm_add_epi16(d[1], dc_value); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); } void idct4_sse2(__m128i *in) { @@ -186,7 +77,7 @@ void idct4_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8]; - transpose_4x4(in); + transpose_16bit_4x4(in); // stage 1 u[0] = _mm_unpacklo_epi16(in[0], in[1]); u[1] = _mm_unpackhi_epi16(in[0], in[1]); @@ -224,7 +115,7 @@ void iadst4_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 
__m128i u[8], v[8], in7; - transpose_4x4(in); + transpose_16bit_4x4(in); in7 = _mm_srli_si128(in[1], 8); in7 = _mm_add_epi16(in7, in[0]); in7 = _mm_sub_epi16(in7, in[1]); @@ -3349,595 +3240,3 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, RECON_AND_STORE(dest + 24 + j * stride, dc_value); } } - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = 
_mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and 
Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. 
Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], 
inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. 
- max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - __m128i dc_value, d; - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - int a, i, j; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out; - - out = 
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); - a = ROUND_POWER_OF_TWO(out, 6); - - d = _mm_set1_epi32(a); - dc_value = _mm_packs_epi32(d, d); - for (i = 0; i < 32; ++i) { - for (j = 0; j < 4; ++j) { - d = _mm_loadu_si128((const __m128i *)(&dest[j * 8])); - d = _mm_adds_epi16(d, dc_value); - d = _mm_max_epi16(d, zero); - d = _mm_min_epi16(d, max); - _mm_storeu_si128((__m128i *)(&dest[j * 8]), d); - } - dest += stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h index d5683ab1cf0..0460ab13bcb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h @@ -279,6 +279,34 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { res3 = _mm_packs_epi32(tmp6, tmp7); \ } +static INLINE void recon_and_store4x4_sse2(const __m128i *const in, + uint8_t *const dest, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d[2]; + + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], in[0]); + d[1] = _mm_add_epi16(d[1], in[1]); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); +} + void idct4_sse2(__m128i *in); void idct8_sse2(__m128i *in); void idct16_sse2(__m128i *in0, __m128i *in1); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h new file mode 100644 index 00000000000..a5e40245a09 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#define VPX_DSP_X86_TRANSPOSE_SSE2_H_ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void transpose_16bit_4x4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + +static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1, + __m128i *const a2, __m128i *const a3) { + // Unpack 32 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0: 00 10 01 11 + // b1: 20 30 21 31 + // b2: 02 12 03 13 + // b3: 22 32 23 33 + + const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1); + const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3); + const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1); + const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3); + + // Unpack 64 bit elements resulting in: + // a0: 00 10 20 30 + // a1: 01 11 21 31 + // a2: 02 12 22 32 + // a3: 03 13 23 33 + *a0 = _mm_unpacklo_epi64(b0, b1); + *a1 = _mm_unpackhi_epi64(b0, b1); + *a2 = _mm_unpacklo_epi64(b2, b3); + *a3 = _mm_unpackhi_epi64(b2, b3); +} + +#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm index e2311c11670..389a692dbc9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -32,9 +32,7 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ mov r4d, dword wm %ifidn %2, highbd shl r4d, 1 - shl srcq, 1 shl src_strideq, 1 - shl dstq, 1 shl dst_strideq, 1 %else cmp r4d, 4 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h index 49954e90477..bfef783b133 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h @@ -35,8 +35,10 @@ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) +#define CAST_TO_SHORTPTR(x) ((uint16_t *)((uintptr_t)(x))) #if CONFIG_VP9_HIGHBITDEPTH #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) +#define CAST_TO_BYTEPTR(x) ((uint8_t *)((uintptr_t)(x))) #endif // CONFIG_VP9_HIGHBITDEPTH #if !defined(__has_feature) diff --git a/chromium/third_party/libvpx/source/libvpx/vpxdec.c b/chromium/third_party/libvpx/source/libvpx/vpxdec.c index fa85ac8587c..6db2afb4aec 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpxdec.c +++ b/chromium/third_party/libvpx/source/libvpx/vpxdec.c @@ -977,7 +977,7 @@ static int main_loop(int argc, const char **argv_) { if (do_md5) { update_image_md5(img, planes, &md5_ctx); } else { - write_image_file(img, planes, outfile); + if (!corrupted) write_image_file(img, planes, outfile); } } else { generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, diff --git a/chromium/third_party/libvpx/source/libvpx/webmdec.cc b/chromium/third_party/libvpx/source/libvpx/webmdec.cc index ed4bd700dd7..d609075a932 100644 --- a/chromium/third_party/libvpx/source/libvpx/webmdec.cc +++ b/chromium/third_party/libvpx/source/libvpx/webmdec.cc @@ -165,10 +165,11 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, } if (get_new_block) { block = block_entry->GetBlock(); + if (block == NULL) return -1; webm_ctx->block_frame_index = 0; } - } while (block->GetTrackNumber() != webm_ctx->video_track_index || - block_entry_eos); + } while (block_entry_eos || + block->GetTrackNumber() != webm_ctx->video_track_index); webm_ctx->cluster = cluster; webm_ctx->block_entry = block_entry;
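
The new 4x4, 8x8 and 16x16 high-bit-depth paths above all funnel their final reconstruction through the clamp_high_sse2() helper introduced in highbd_inv_txfm_sse2.h (the 32x32 DC-only path open-codes the same clamp with _mm_max_epi16/_mm_min_epi16), so that helper is the one piece worth sanity-checking in isolation. Below is a minimal standalone sketch: the SIMD arithmetic is copied from the patch, while the clamp_high name, the main() harness, the 10-bit depth and the test values are illustrative assumptions, not part of the commit.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

// Saturate eight signed 16-bit sums to the valid pixel range
// [0, (1 << bd) - 1]; same arithmetic as the patch's clamp_high_sse2().
static __m128i clamp_high(__m128i value, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  __m128i ubounded = _mm_cmpgt_epi16(value, max);      // lanes above max
  __m128i retval = _mm_andnot_si128(ubounded, value);  // keep in-range lanes
  ubounded = _mm_and_si128(ubounded, max);             // overflowed lanes -> max
  retval = _mm_or_si128(retval, ubounded);
  // cmpgt(x, 0) is all-ones only for x > 0, so this zeroes negative lanes.
  return _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
}

int main(void) {
  const int bd = 10;  // 10-bit pixels: valid range is [0, 1023]
  int16_t in[8] = { -32768, -5, 0, 1, 512, 1023, 1024, 4096 };
  int16_t got[8];
  int i;
  __m128i v = _mm_loadu_si128((const __m128i *)in);
  _mm_storeu_si128((__m128i *)got, clamp_high(v, bd));
  for (i = 0; i < 8; i++) {
    int ref = in[i] < 0 ? 0 : (in[i] > 1023 ? 1023 : in[i]);
    printf("%6d -> %4d (expect %4d)\n", in[i], got[i], ref);
  }
  return 0;
}

Built with e.g. cc -msse2, every lane should come back clamped to [0, 1023], matching the scalar reference. The max_/min_ pre-checks in the transforms above (bounds 12043, 6201 and 3155) serve a complementary purpose: they prove the 16-bit intermediates cannot overflow inside the SSE2 idct stages, falling back to the C transform when they might, so this clamp only ever has to fix up the final pixel additions.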